From 8da9e3f7476137747b8502b87df80738861d324c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 17:18:56 -0700 Subject: [PATCH 0001/1212] f2fs: backport from (4c1fad64 - Merge tag 'for-f2fs-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs) Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 18 +- fs/Kconfig | 2 + fs/Makefile | 1 + fs/crypto/Kconfig | 18 + fs/crypto/Makefile | 3 + fs/crypto/crypto.c | 568 +++++++++ fs/{f2fs/crypto_fname.c => crypto/fname.c} | 276 ++-- fs/crypto/keyinfo.c | 304 +++++ fs/crypto/policy.c | 246 ++++ fs/f2fs/Kconfig | 21 +- fs/f2fs/Makefile | 2 - fs/f2fs/acl.c | 21 +- fs/f2fs/acl.h | 3 +- fs/f2fs/checkpoint.c | 529 +++++--- fs/f2fs/crypto.c | 491 -------- fs/f2fs/crypto_key.c | 254 ---- fs/f2fs/crypto_policy.c | 212 ---- fs/f2fs/data.c | 1252 ++++++++++-------- fs/f2fs/debug.c | 80 +- fs/f2fs/dir.c | 451 +++---- fs/f2fs/extent_cache.c | 315 ++--- fs/f2fs/f2fs.h | 1010 +++++++++------ fs/f2fs/f2fs_crypto.h | 151 --- fs/f2fs/file.c | 1323 ++++++++++++++------ fs/f2fs/gc.c | 353 ++++-- fs/f2fs/gc.h | 8 - fs/f2fs/inline.c | 266 ++-- fs/f2fs/inode.c | 176 +-- fs/f2fs/namei.c | 409 +++--- fs/f2fs/node.c | 733 +++++++---- fs/f2fs/node.h | 123 +- fs/f2fs/recovery.c | 274 ++-- fs/f2fs/segment.c | 689 ++++++---- fs/f2fs/segment.h | 47 +- fs/f2fs/shrinker.c | 8 +- fs/f2fs/super.c | 845 ++++++++++--- fs/f2fs/trace.c | 6 +- fs/f2fs/xattr.c | 69 +- fs/f2fs/xattr.h | 3 +- include/linux/dcache.h | 1 + include/linux/f2fs_fs.h | 44 +- include/linux/fs.h | 7 + include/linux/fscrypto.h | 435 +++++++ include/trace/events/f2fs.h | 64 +- include/uapi/linux/fs.h | 18 + 45 files changed, 7635 insertions(+), 4494 deletions(-) create mode 100644 fs/crypto/Kconfig create mode 100644 fs/crypto/Makefile create mode 100644 fs/crypto/crypto.c rename fs/{f2fs/crypto_fname.c => crypto/fname.c} (54%) create mode 100644 fs/crypto/keyinfo.c create mode 100644 fs/crypto/policy.c delete mode 100644 fs/f2fs/crypto.c delete mode 
100644 fs/f2fs/crypto_key.c delete mode 100644 fs/f2fs/crypto_policy.c delete mode 100644 fs/f2fs/f2fs_crypto.h create mode 100644 include/linux/fscrypto.h diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b102b436563e..753dd4f96afe 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,14 +102,16 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle. If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. If background_gc=sync, it will turn + will be turned off. If background_gc=sync, it will turn on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) -discard Issue discard/TRIM commands when a segment is cleaned. +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. no_heap Disable heap-style segment allocation which finds free segments for data from the beginning of main area, while for node from the end of main area. @@ -129,6 +131,7 @@ inline_dentry Enable the inline dir feature: data in new created directory entries can be written into inode block. The space of inode block which is used to store inline dentries is limited to ~3.4k. +noinline_dentry Disable the inline dentry feature. flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. 
If the underlying device handles the cache_flush command relatively slowly, @@ -145,10 +148,15 @@ extent_cache Enable an extent cache based on rb-tree, it can cache as many as extent which map between contiguous logical address and physical address per inode, resulting in increasing the cache hit ratio. Set by default. -noextent_cache Diable an extent cache based on rb-tree explicitly, see +noextent_cache Disable an extent cache based on rb-tree explicitly, see the above extent_cache mount option. noinline_data Disable the inline data feature, inline data feature is enabled by default. +data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. ================================================================================ DEBUGFS ENTRIES @@ -192,7 +200,7 @@ Files in /sys/fs/f2fs/ policy for garbage collection. Setting gc_idle = 0 (default) will disable this option. Setting gc_idle = 1 will select the Cost Benefit approach - & setting gc_idle = 2 will select the greedy aproach. + & setting gc_idle = 2 will select the greedy approach. reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree @@ -298,7 +306,7 @@ The dump.f2fs shows the information of specific inode and dumps SSA and SIT to file. Each file is dump_ssa and dump_sit. The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. -It shows on-disk inode information reconized by a given inode number, and is +It shows on-disk inode information recognized by a given inode number, and is able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and ./dump_sit respectively. 
diff --git a/fs/Kconfig b/fs/Kconfig index 6ce72d8d1ee1..16a7e2871213 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -73,6 +73,8 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +source "fs/crypto/Kconfig" + source "fs/notify/Kconfig" source "fs/quota/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 79f522575cba..252c96898a43 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o +obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig new file mode 100644 index 000000000000..92348faf9865 --- /dev/null +++ b/fs/crypto/Kconfig @@ -0,0 +1,18 @@ +config FS_ENCRYPTION + tristate "FS Encryption (Per-file encryption)" + depends on BLOCK + select CRYPTO + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile new file mode 100644 index 000000000000..f17684c48739 --- /dev/null +++ b/fs/crypto/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o + +fscrypto-y := crypto.o fname.o policy.o keyinfo.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c new file mode 100644 index 000000000000..2fc8c43ce531 --- /dev/null +++ b/fs/crypto/crypto.c @@ -0,0 +1,568 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. 
+ * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *fscrypt_bounce_page_pool = NULL; + +static LIST_HEAD(fscrypt_free_ctxs); +static DEFINE_SPINLOCK(fscrypt_ctx_lock); + +static struct workqueue_struct *fscrypt_read_workqueue; +static DEFINE_MUTEX(fscrypt_init_mutex); + +static struct kmem_cache *fscrypt_ctx_cachep; +struct kmem_cache *fscrypt_info_cachep; + +/** + * fscrypt_release_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. 
+ */ +void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); + ctx->w.bounce_page = NULL; + } + ctx->w.control_page = NULL; + if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(fscrypt_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + list_add(&ctx->free_list, &fscrypt_free_ctxs); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + } +} +EXPORT_SYMBOL(fscrypt_release_ctx); + +/** + * fscrypt_get_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx = NULL; + struct fscrypt_info *ci = inode->i_crypt_info; + unsigned long flags; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. 
+ */ + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + ctx = list_first_entry_or_null(&fscrypt_free_ctxs, + struct fscrypt_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~FS_WRITE_PATH_FL; + return ctx; +} +EXPORT_SYMBOL(fscrypt_get_ctx); + +/** + * fscrypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void fscrypt_complete(struct crypto_async_request *req, int res) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +static int do_page_crypto(struct inode *inode, + fscrypt_direction_t rw, pgoff_t index, + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) +{ + u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = skcipher_request_alloc(tfm, gfp_flags); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + fscrypt_complete, &ecr); + + BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + FS_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_init_table(&src, 1); 
+ sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, + xts_tweak); + if (rw == FS_DECRYPT) + res = crypto_skcipher_decrypt(req); + else + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + skcipher_request_free(req); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_skcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +{ + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * fscrypt_encrypt_page() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * fscrypt_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *fscrypt_encrypt_page(struct inode *inode, + struct page *plaintext_page, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = fscrypt_get_ctx(inode, gfp_flags); + if (IS_ERR(ctx)) + return (struct page *)ctx; + + /* The encryption operation will require a bounce page. 
*/ + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + if (IS_ERR(ciphertext_page)) + goto errout; + + ctx->w.control_page = plaintext_page; + err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page, + gfp_flags); + if (err) { + ciphertext_page = ERR_PTR(err); + goto errout; + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; + +errout: + fscrypt_release_ctx(ctx); + return ciphertext_page; +} +EXPORT_SYMBOL(fscrypt_encrypt_page); + +/** + * fscrypt_decrypt_page() - Decrypts a page in-place + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_decrypt_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return do_page_crypto(page->mapping->host, + FS_DECRYPT, page->index, page, page, GFP_NOFS); +} +EXPORT_SYMBOL(fscrypt_decrypt_page); + +int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != 
inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if ((err == 0) && bio->bi_error) + err = -EIO; + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + struct fscrypt_info *ci; + int dir_has_key, cached_with_key; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + dput(dir); + return 0; + } + + ci = d_inode(dir)->i_crypt_info; + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be a flag in d_flags */ + spin_lock(&dentry->d_lock); + cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + dir_has_key = (ci != NULL); + dput(dir); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. 
+ */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) + return 0; + return 1; +} + +const struct dentry_operations fscrypt_d_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +EXPORT_SYMBOL(fscrypt_d_ops); + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. 
*/ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +void fscrypt_restore_control_page(struct page *page) +{ + struct fscrypt_ctx *ctx; + + ctx = (struct fscrypt_ctx *)page_private(page); + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + fscrypt_release_ctx(ctx); +} +EXPORT_SYMBOL(fscrypt_restore_control_page); + +static void fscrypt_destroy(void) +{ + struct fscrypt_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list) + kmem_cache_free(fscrypt_ctx_cachep, pos); + INIT_LIST_HEAD(&fscrypt_free_ctxs); + mempool_destroy(fscrypt_bounce_page_pool); + fscrypt_bounce_page_pool = NULL; +} + +/** + * fscrypt_initialize() - allocate major buffers for fs encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_initialize(void) +{ + int i, res = -ENOMEM; + + if (fscrypt_bounce_page_pool) + return 0; + + mutex_lock(&fscrypt_init_mutex); + if (fscrypt_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct fscrypt_ctx *ctx; + + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &fscrypt_free_ctxs); + } + + fscrypt_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!fscrypt_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&fscrypt_init_mutex); + return 0; +fail: + fscrypt_destroy(); + mutex_unlock(&fscrypt_init_mutex); + return res; +} +EXPORT_SYMBOL(fscrypt_initialize); + +/** + * fscrypt_init() - Set up for fs encryption. 
+ */ +static int __init fscrypt_init(void) +{ + fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", + WQ_HIGHPRI, 0); + if (!fscrypt_read_workqueue) + goto fail; + + fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_ctx_cachep) + goto fail_free_queue; + + fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_info_cachep) + goto fail_free_ctx; + + return 0; + +fail_free_ctx: + kmem_cache_destroy(fscrypt_ctx_cachep); +fail_free_queue: + destroy_workqueue(fscrypt_read_workqueue); +fail: + return -ENOMEM; +} +module_init(fscrypt_init) + +/** + * fscrypt_exit() - Shutdown the fs encryption system + */ +static void __exit fscrypt_exit(void) +{ + fscrypt_destroy(); + + if (fscrypt_read_workqueue) + destroy_workqueue(fscrypt_read_workqueue); + kmem_cache_destroy(fscrypt_ctx_cachep); + kmem_cache_destroy(fscrypt_info_cachep); +} +module_exit(fscrypt_exit); + +MODULE_LICENSE("GPL"); diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c similarity index 54% rename from fs/f2fs/crypto_fname.c rename to fs/crypto/fname.c index ab377d496a39..5d6d49113efa 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/crypto/fname.c @@ -1,46 +1,32 @@ /* - * linux/fs/f2fs/crypto_fname.c - * - * Copied from linux/fs/ext4/crypto.c + * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * - * This contains functions for filename crypto management in f2fs - * * Written by Uday Savagaonkar, 2014. - * - * Adjust f2fs dentry structure - * Jaegeuk Kim, 2015. + * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. 
*/ -#include -#include + #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include +#include -#include "f2fs.h" -#include "f2fs_crypto.h" -#include "xattr.h" +static u32 size_round_up(size_t size, size_t blksize) +{ + return ((size + blksize - 1) / blksize) * blksize; +} /** - * f2fs_dir_crypt_complete() - + * dir_crypt_complete() - */ -static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) +static void dir_crypt_complete(struct crypto_async_request *req, int res) { - struct f2fs_completion_result *ecr = req->data; + struct fscrypt_completion_result *ecr = req->data; if (res == -EINPROGRESS) return; @@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -bool f2fs_valid_filenames_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS); -} - -static unsigned max_name_len(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - F2FS_NAME_LEN; -} - /** - * f2fs_fname_encrypt() - + * fname_encrypt() - * * This function encrypts the input filename, and returns the length of the * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. 
*/ -static int f2fs_fname_encrypt(struct inode *inode, - const struct qstr *iname, struct f2fs_str *oname) +static int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; + char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; - int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim = max_name_len(inode); + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? - F2FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? + FS_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = size_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > lim) ? 
lim : ciphertext_len; if (ciphertext_len <= sizeof(buf)) { @@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode, memset(workbuf + iname->len, 0, ciphertext_len - iname->len); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); - if (res < 0) { + skcipher_request_free(req); + if (res < 0) printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); - } + oname->len = ciphertext_len; return res; } /* - * f2fs_fname_decrypt() + * fname_decrypt() * This function decrypts the input filename, and returns * the length of the plaintext. * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. 
*/ -static int f2fs_fname_decrypt(struct inode *inode, - const struct f2fs_str *iname, struct f2fs_str *oname) +static int fname_decrypt(struct inode *inode, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; - unsigned lim = max_name_len(inode); + char iv[FS_CRYPTO_BLOCK_SIZE]; + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR - "%s: Error in f2fs_fname_decrypt (error 
code %d)\n", - __func__, res); + "%s: Error (error code %d)\n", __func__, res); return res; } @@ -200,7 +175,7 @@ static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; /** - * f2fs_fname_encode_digest() - + * digest_encode() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. @@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -/** - * f2fs_fname_crypto_round_up() - - * - * Return: The next multiple of block size - */ -u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize) +u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) { - return ((size + blksize - 1) / blksize) * blksize; + int padding = 32; + struct fscrypt_info *ci = inode->i_crypt_info; + + if (ci) + padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); + if (ilen < FS_CRYPTO_BLOCK_SIZE) + ilen = FS_CRYPTO_BLOCK_SIZE; + return size_round_up(ilen, padding); } +EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * f2fs_fname_crypto_alloc_obuff() - + * fscrypt_fname_crypto_alloc_obuff() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
*/ -int f2fs_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct f2fs_str *crypto_str) +int fscrypt_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen; - int padding = 16; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); - if (ci) - padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); - if (padding < F2FS_CRYPTO_BLOCK_SIZE) - padding = F2FS_CRYPTO_BLOCK_SIZE; - olen = f2fs_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; - if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; - /* Allocated buffer can hold one more character to null-terminate the - * string */ + if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) + olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + /* + * Allocated buffer can hold one more character to null-terminate the + * string + */ crypto_str->name = kmalloc(olen + 1, GFP_NOFS); if (!(crypto_str->name)) return -ENOMEM; return 0; } +EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * f2fs_fname_crypto_free_buffer() - + * fscrypt_fname_crypto_free_buffer() - * * Frees the buffer allocated for crypto operation. 
*/ -void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) +void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } +EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space + * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user + * space */ -int f2fs_fname_disk_to_usr(struct inode *inode, - f2fs_hash_t *hash, - const struct f2fs_str *iname, - struct f2fs_str *oname) +int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; int ret; - if (is_dot_dotdot(&qname)) { + if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - if (F2FS_I(inode)->i_crypt_info) - return f2fs_fname_decrypt(inode, iname, oname); + if (iname->len < FS_CRYPTO_BLOCK_SIZE) + return -EUCLEAN; - if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (inode->i_crypt_info) + return fname_decrypt(inode, iname, oname); + + if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); oname->len = ret; return ret; } if (hash) { - memcpy(buf, hash, 4); - memset(buf + 4, 0, 4); - } else + memcpy(buf, &hash, 4); + memcpy(buf + 4, &minor_hash, 4); + } else { memset(buf, 0, 8); + } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; ret = digest_encode(buf, 24, oname->name + 1); oname->len = ret + 1; return ret + 1; } +EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** - * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space + * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk + * space */ -int f2fs_fname_usr_to_disk(struct inode *inode, +int fscrypt_fname_usr_to_disk(struct inode 
*inode, const struct qstr *iname, - struct f2fs_str *oname) + struct fscrypt_str *oname) { - int res; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (is_dot_dotdot(iname)) { + if (fscrypt_is_dot_dotdot(iname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - - if (ci) { - res = f2fs_fname_encrypt(inode, iname, oname); - return res; - } - /* Without a proper key, a user is not allowed to modify the filenames + if (inode->i_crypt_info) + return fname_encrypt(inode, iname, oname); + /* + * Without a proper key, a user is not allowed to modify the filenames * in a directory. Consequently, a user space name cannot be mapped to - * a disk-space name */ + * a disk-space name + */ return -EACCES; } +EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); -int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct f2fs_filename *fname) +int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct fscrypt_name *fname) { - struct f2fs_crypt_info *ci; int ret = 0, bigname = 0; - memset(fname, 0, sizeof(struct f2fs_filename)); + memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { + if (!dir->i_sb->s_cop->is_encrypted(dir) || + fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } - ret = f2fs_get_encryption_info(dir); - if (ret) + ret = get_crypt_info(dir); + if (ret && ret != -EOPNOTSUPP) return ret; - ci = F2FS_I(dir)->i_crypt_info; - if (ci) { - ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, - &fname->crypto_buf); + + if (dir->i_crypt_info) { + ret = fscrypt_fname_alloc_buffer(dir, iname->len, + &fname->crypto_buf); if (ret < 0) return ret; - ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); + ret = fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) goto errout; 
fname->disk_name.name = fname->crypto_buf.name; @@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, if (!lookup) return -EACCES; - /* We don't have the key and we are doing a lookup; decode the + /* + * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->name[0] == '_') bigname = 1; - if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) + if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) return -ENOENT; fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, fname->crypto_buf.name); if (ret < 0) { @@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->crypto_buf.len = ret; if (bigname) { memcpy(&fname->hash, fname->crypto_buf.name, 4); + memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; } return 0; + errout: - f2fs_fname_crypto_free_buffer(&fname->crypto_buf); + fscrypt_fname_free_buffer(&fname->crypto_buf); return ret; } +EXPORT_SYMBOL(fscrypt_setup_filename); -void f2fs_fname_free_filename(struct f2fs_filename *fname) +void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; fname->usr_fname = NULL; fname->disk_name.name = NULL; } +EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c new file mode 100644 index 000000000000..1ac263eddc4e --- /dev/null +++ b/fs/crypto/keyinfo.c @@ -0,0 +1,304 @@ +/* + * key management facility for FS encryption support. + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
+ */ + +#include +#include +#include +#include +#include +#include + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivation. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], + u8 source_key[FS_AES_256_XTS_KEY_SIZE], + u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_skcipher_setkey(tfm, deriving_key, + FS_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + FS_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + wait_for_completion(&ecr.completion); + res = ecr.res; + } +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key 
*keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + +static void put_crypt_info(struct fscrypt_info *ci) +{ + if (!ci) + return; + + key_put(ci->ci_keyring_key); + crypto_free_skcipher(ci->ci_ctfm); + kmem_cache_free(fscrypt_info_cachep, ci); +} + +int get_crypt_info(struct inode *inode) +{ + struct fscrypt_info *crypt_info; + struct fscrypt_context ctx; + struct crypto_skcipher *ctfm; + const char *cipher_str; + u8 raw_key[FS_MAX_KEY_SIZE]; + u8 mode; + int res; + + res = fscrypt_initialize(); + if (res) + 
return res; + + if (!inode->i_sb->s_cop->get_context) + return -EOPNOTSUPP; +retry: + crypt_info = ACCESS_ONCE(inode->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + fscrypt_put_encryption_info(inode, crypt_info); + goto retry; + } + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!fscrypt_dummy_context_enabled(inode)) + return res; + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) { + return -EINVAL; + } + res = 0; + + crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case FS_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "%s: unsupported key mode %d (ino %u)\n", + __func__, mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + if (fscrypt_dummy_context_enabled(inode)) { + memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); + goto got_key; + } + + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, 
&prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { + goto out; + } +got_key: + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + if (res) + goto out; + + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { + put_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) +{ + struct fscrypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(inode->i_crypt_info); + if (ci == NULL) + return; + + prev = cmpxchg(&inode->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + put_crypt_info(ci); +} +EXPORT_SYMBOL(fscrypt_put_encryption_info); + +int fscrypt_get_encryption_info(struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return get_crypt_info(inode); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c new file mode 100644 index 000000000000..ed115acb5dee --- /dev/null +++ b/fs/crypto/policy.c @@ -0,0 +1,246 @@ +/* + * Encryption policy functions for per-file encryption support. + * + * Copyright (C) 2015, Google, Inc. 
+ * Copyright (C) 2015, Motorola Mobility. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#include +#include +#include +#include + +static int inode_has_encryption_context(struct inode *inode) +{ + if (!inode->i_sb->s_cop->get_context) + return 0; + return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int is_encryption_context_consistent_with_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int create_encryption_context_from_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + if (inode->i_sb->s_cop->prepare_context) { + res = inode->i_sb->s_cop->prepare_context(inode); + if (res) + return res; + } + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + + if (!fscrypt_valid_contents_enc_mode( + policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + + if (!fscrypt_valid_filenames_enc_mode( + policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return 
-EINVAL; + } + + if (policy->flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; + + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + + return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); +} + +int fscrypt_process_policy(struct file *filp, + const struct fscrypt_policy *policy) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (policy->version != 0) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; + } + + mnt_drop_write_file(filp); + return ret; +} +EXPORT_SYMBOL(fscrypt_process_policy); + +int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context || + !inode->i_sb->s_cop->is_encrypted(inode)) + return -ENODATA; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return -ENODATA; + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, 
ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_policy); + +int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) +{ + struct fscrypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + printk(KERN_ERR "parent %p child %p\n", parent, child); + BUG_ON(1); + } + + /* no restrictions if the parent directory is not encrypted */ + if (!parent->i_sb->s_cop->is_encrypted(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!parent->i_sb->s_cop->is_encrypted(child)) + return 0; + res = fscrypt_get_encryption_info(parent); + if (res) + return 0; + res = fscrypt_get_encryption_info(child); + if (res) + return 0; + parent_ci = parent->i_crypt_info; + child_ci = child->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} +EXPORT_SYMBOL(fscrypt_has_permitted_context); + +/** + * fscrypt_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * @fs_data: private data given by FS. 
+ * @preload: preload child i_crypt_info + * + * Return: Zero on success, non-zero otherwise + */ +int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload) +{ + struct fscrypt_context ctx; + struct fscrypt_info *ci; + int res; + + if (!parent->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + res = fscrypt_get_encryption_info(parent); + if (res < 0) + return res; + + ci = parent->i_crypt_info; + if (ci == NULL) + return -ENOKEY; + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + if (fscrypt_dummy_context_enabled(parent)) { + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); + } + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + res = parent->i_sb->s_cop->set_context(child, &ctx, + sizeof(ctx), fs_data); + if (res) + return res; + return preload ? fscrypt_get_encryption_info(child): 0; +} +EXPORT_SYMBOL(fscrypt_inherit_context); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index b0a9dc929f88..1852d99df97b 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,6 +1,9 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select CRYPTO + select KEYS + select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. 
The design has been focused on @@ -76,15 +79,7 @@ config F2FS_FS_ENCRYPTION bool "F2FS Encryption" depends on F2FS_FS depends on F2FS_FS_XATTR - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_CTR - select CRYPTO_SHA256 - select KEYS - select ENCRYPTED_KEYS + select FS_ENCRYPTION help Enable encryption of f2fs files and directories. This feature is similar to ecryptfs, but it is more memory @@ -100,3 +95,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 08e101ed914c..ca949ea7c02f 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o -f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \ - crypto_key.o crypto_fname.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c8f25f7241f0..fb0744b94c2f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -109,14 +109,16 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_NOFS); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +177,7 @@ static struct posix_acl 
*__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -204,7 +206,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -217,7 +218,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; - set_acl_inode(fi, inode->i_mode); + set_acl_inode(inode, inode->i_mode); if (error == 0) acl = NULL; } @@ -234,9 +235,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -247,7 +248,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } @@ -388,6 +389,8 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; + f2fs_mark_inode_dirty_sync(inode); + if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 997ca8edb6cb..2c685185c24d 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,11 +37,10 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); 
+extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d80474be..cb23d6cf676b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -34,13 +42,14 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META); - SetPageUptodate(page); + f2fs_wait_on_page_writeback(page, META, true); + if (!PageUptodate(page)) + SetPageUptodate(page); return page; } @@ -56,14 +65,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO, - .blk_addr = index, + .old_blkaddr = index, + .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) fio.rw &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -90,7 +100,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, * meta page. 
*/ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } @@ -143,7 +153,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { - block_t prev_blk_addr = 0; struct page *page; block_t blkno = start; struct f2fs_io_info fio = { @@ -152,10 +161,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; + struct blk_plug plug; if (unlikely(type == META_POR)) fio.rw &= ~REQ_META; + blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) @@ -167,27 +178,25 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) blkno = 0; /* get nat block addr */ - fio.blk_addr = current_nat_addr(sbi, + fio.new_blkaddr = current_nat_addr(sbi, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: /* get sit block addr */ - fio.blk_addr = current_sit_addr(sbi, + fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - if (blkno != start && prev_blk_addr + 1 != fio.blk_addr) - goto out; - prev_blk_addr = fio.blk_addr; break; case META_SSA: case META_CP: case META_POR: - fio.blk_addr = blkno; + fio.new_blkaddr = blkno; break; default: BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { @@ -196,11 +205,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; + fio.old_blkaddr = fio.new_blkaddr; f2fs_submit_page_mbio(&fio); f2fs_put_page(page, 0); } out: f2fs_submit_merged_bio(sbi, META, READ); + blk_finish_plug(&plug); return blkno - start; } @@ -210,7 +221,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t 
index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); @@ -232,13 +243,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); + return 0; redirty_out: @@ -252,13 +267,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; - trace_f2fs_writepages(mapping->host, wbc, META); - /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, META); + /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); @@ -269,6 +284,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); return 0; } @@ -276,15 +292,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { .for_reclaim = 0, }; + struct blk_plug plug; pagevec_init(&pvec, 0); + blk_start_plug(&plug); + while (index <= end) { int i, nr_pages; nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, @@ -296,7 +315,7 @@ long 
sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (prev == LONG_MAX) + if (prev == ULONG_MAX) prev = page->index - 1; if (nr_to_write != LONG_MAX && page->index != prev + 1) { pagevec_release(&pvec); @@ -315,6 +334,9 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, goto continue_unlock; } + f2fs_wait_on_page_writeback(page, META, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -334,6 +356,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); + blk_finish_plug(&plug); + return nwritten; } @@ -341,9 +365,10 @@ static int f2fs_set_meta_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, META); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -358,6 +383,9 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -410,13 +438,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ 
__remove_ino_entry(sbi, ino, type); @@ -434,12 +462,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -459,6 +487,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -478,10 +513,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + update_inode_page(inode); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -493,8 +529,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; + struct node_info ni; + int err = acquire_orphan_inode(sbi); - inode = f2fs_iget(sbi->sb, ino); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; + } + + __add_ino_entry(sbi, ino, ORPHAN_INO); + + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry @@ -508,6 +556,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); + 
+ get_node_info(sbi, ino, &ni); + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return -EIO; + } + __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; } @@ -516,7 +576,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; int err; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -540,7 +600,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); return 0; } @@ -601,45 +661,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) } } +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) +{ + unsigned long blk_size = sbi->blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + *cp_page = get_meta_page(sbi, cp_addr); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset >= blk_size) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } + + crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block + + crc_offset))); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } + + *version = cur_cp_version(*cp_block); + return 0; +} + static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, block_t cp_addr, unsigned long long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; - unsigned long 
blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; - __u32 crc = 0; + int err; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); - - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) goto invalid_cp1; + pre_version = *version; - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp1; - - pre_version = cur_cp_version(cp_block); - - /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); - - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) goto invalid_cp2; - - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp2; - - cur_version = cur_cp_version(cp_block); + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; @@ -696,6 +766,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + /* Sanity checking of checkpoint */ + if (sanity_check_ckpt(sbi)) + goto fail_no_cp; + if (cp_blks <= 1) goto done; @@ -722,118 +796,94 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry 
*new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(inode, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); -} - -void remove_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); - - /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); - iput(inode); - } + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -848,6 +898,34 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) goto retry; } +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_entry(head->next, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + update_inode_page(inode); + iput(inode); + } + }; + return 0; +} + /* * Freeze all the FS-operations for checkpoint. 
*/ @@ -868,11 +946,17 @@ static int block_operations(struct f2fs_sb_info *sbi) /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) + goto out; + goto retry_flush_dents; + } + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +969,9 @@ static int block_operations(struct f2fs_sb_info *sbi) if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -901,6 +984,8 @@ static int block_operations(struct f2fs_sb_info *sbi) static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); + + build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -911,18 +996,48 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; - io_schedule(); + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + spin_lock(&sbi->cp_lock); + + if (cpc->reason == CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, 
CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + + spin_unlock(&sbi->cp_lock); +} + +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; @@ -931,21 +1046,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); - bool invalidate = false; - - /* - * This avoids to conduct wrong roll-forward operations and uses - * metapages, so should be called prior to sync_meta_pages below. 
- */ - if (discard_next_dnode(sbi, discard_blk)) - invalidate = true; + struct super_block *sb = sbi->sb; + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -980,10 +1089,12 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); + spin_lock(&sbi->cp_lock); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock(&sbi->cp_lock); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -998,29 +1109,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) cp_payload_blks + data_sum_blocks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - else - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - - if (cpc->reason == CP_FASTBOOT) - set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - else - clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - - if (orphan_num) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); - crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); + crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset)); *((__le32 *)((unsigned char *)ckpt + 
le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); @@ -1030,7 +1126,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1046,6 +1142,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + if (sb->s_bdev->bd_part) + kbytes_written += BD_PART_WRITTEN(sbi); + + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + if (__remain_node_summaries(cpc->reason)) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; @@ -1058,14 +1162,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); @@ -1073,30 +1177,36 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - /* - * invalidate meta page which is used temporarily for zeroing out - * block at the end of warm node chain. 
- */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, - discard_blk); - - release_dirty_inode(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,21 +1214,35 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_flush_merged_bios(sbi); + + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { + f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); + f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); + 
f2fs_bug_on(sbi, prefree_segments(sbi)); + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } /* * update checkpoint pack index @@ -1133,7 +1257,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1267,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c deleted file mode 100644 index 4a62ef14e932..000000000000 --- a/fs/f2fs/crypto.c +++ /dev/null @@ -1,491 +0,0 @@ -/* - * linux/fs/f2fs/crypto.c - * - * Copied from linux/fs/ext4/crypto.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility - * - * This contains encryption functions for f2fs - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Remove ext4_encrypted_zeroout(), - * add f2fs_restore_and_release_control_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -/* Encryption added and removed here! (L: */ - -static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; - -module_param(num_prealloc_crypto_pages, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_pages, - "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); - -static mempool_t *f2fs_bounce_page_pool; - -static LIST_HEAD(f2fs_free_crypto_ctxs); -static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock); - -static struct workqueue_struct *f2fs_read_workqueue; -static DEFINE_MUTEX(crypto_init); - -static struct kmem_cache *f2fs_crypto_ctx_cachep; -struct kmem_cache *f2fs_crypt_info_cachep; - -/** - * f2fs_release_crypto_ctx() - Releases an encryption context - * @ctx: The encryption context to release. - * - * If the encryption context was allocated from the pre-allocated pool, returns - * it to that pool. Else, frees it. - * - * If there's a bounce page in the context, this frees that. 
- */ -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) { - mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool); - ctx->w.bounce_page = NULL; - } - ctx->w.control_page = NULL; - if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(f2fs_crypto_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - } -} - -/** - * f2fs_get_crypto_ctx() - Gets an encryption context - * @inode: The inode for which we are doing the crypto - * - * Allocates and initializes an encryption context. - * - * Return: An allocated and initialized encryption context on success; error - * value or NULL otherwise. - */ -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode) -{ - struct f2fs_crypto_ctx *ctx = NULL; - unsigned long flags; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (ci == NULL) - return ERR_PTR(-ENOKEY); - - /* - * We first try getting the ctx from a free list because in - * the common case the ctx will have an allocated and - * initialized crypto tfm, so it's probably a worthwhile - * optimization. For the bounce page, we first try getting it - * from the kernel allocator because that's just about as fast - * as getting it from a list and because a cache of free pages - * should generally be a "last resort" option for a filesystem - * to be able to do its job. 
- */ - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs, - struct f2fs_crypto_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags &= ~F2FS_WRITE_PATH_FL; - return ctx; -} - -/* - * Call f2fs_decrypt on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct f2fs_crypto_ctx *ctx = - container_of(work, struct f2fs_crypto_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = f2fs_decrypt(ctx, page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else - SetPageUptodate(page); - unlock_page(page); - } - f2fs_release_crypto_ctx(ctx); - bio_put(bio); -} - -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(f2fs_read_workqueue, &ctx->r.work); -} - -static void f2fs_crypto_destroy(void) -{ - struct f2fs_crypto_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list) - kmem_cache_free(f2fs_crypto_ctx_cachep, pos); - INIT_LIST_HEAD(&f2fs_free_crypto_ctxs); - if (f2fs_bounce_page_pool) - mempool_destroy(f2fs_bounce_page_pool); - f2fs_bounce_page_pool = NULL; -} - -/** - * f2fs_crypto_initialize() - Set up for f2fs encryption. - * - * We only call this when we start accessing encrypted files, since it - * results in memory getting allocated that wouldn't otherwise be used. - * - * Return: Zero on success, non-zero otherwise. 
- */ -int f2fs_crypto_initialize(void) -{ - int i, res = -ENOMEM; - - if (f2fs_bounce_page_pool) - return 0; - - mutex_lock(&crypto_init); - if (f2fs_bounce_page_pool) - goto already_initialized; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct f2fs_crypto_ctx *ctx; - - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL); - if (!ctx) - goto fail; - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - } - - /* must be allocated at the last step to avoid race condition above */ - f2fs_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!f2fs_bounce_page_pool) - goto fail; - -already_initialized: - mutex_unlock(&crypto_init); - return 0; -fail: - f2fs_crypto_destroy(); - mutex_unlock(&crypto_init); - return res; -} - -/** - * f2fs_exit_crypto() - Shutdown the f2fs encryption system - */ -void f2fs_exit_crypto(void) -{ - f2fs_crypto_destroy(); - - if (f2fs_read_workqueue) - destroy_workqueue(f2fs_read_workqueue); - if (f2fs_crypto_ctx_cachep) - kmem_cache_destroy(f2fs_crypto_ctx_cachep); - if (f2fs_crypt_info_cachep) - kmem_cache_destroy(f2fs_crypt_info_cachep); -} - -int __init f2fs_init_crypto(void) -{ - int res = -ENOMEM; - - f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0); - if (!f2fs_read_workqueue) - goto fail; - - f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypto_ctx_cachep) - goto fail; - - f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypt_info_cachep) - goto fail; - - return 0; -fail: - f2fs_exit_crypto(); - return res; -} - -void f2fs_restore_and_release_control_page(struct page **page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. 
*/ - bounce_page = *page; - ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - f2fs_restore_control_page(bounce_page); -} - -void f2fs_restore_control_page(struct page *data_page) -{ - struct f2fs_crypto_ctx *ctx = - (struct f2fs_crypto_ctx *)page_private(data_page); - - set_page_private(data_page, (unsigned long)NULL); - ClearPagePrivate(data_page); - unlock_page(data_page); - f2fs_release_crypto_ctx(ctx); -} - -/** - * f2fs_crypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation - */ -static void f2fs_crypt_complete(struct crypto_async_request *req, int res) -{ - struct f2fs_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -typedef enum { - F2FS_DECRYPT = 0, - F2FS_ENCRYPT, -} f2fs_direction_t; - -static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, - struct inode *inode, - f2fs_direction_t rw, - pgoff_t index, - struct page *src_page, - struct page *dest_page) -{ - u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist dst, src; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; - int res = 0; - - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); - return -ENOMEM; - } - ablkcipher_request_set_callback( - req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_crypt_complete, &ecr); - - BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - F2FS_XTS_TWEAK_SIZE - sizeof(index)); - - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); - sg_init_table(&src, 1); - 
sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); - if (rw == F2FS_DECRYPT) - res = crypto_ablkcipher_decrypt(req); - else - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } - ablkcipher_request_free(req); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", - __func__, res); - return res; - } - return 0; -} - -static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx) -{ - ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT); - if (ctx->w.bounce_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_WRITE_PATH_FL; - return ctx->w.bounce_page; -} - -/** - * f2fs_encrypt() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. - * - * Called on the page write path. The caller must call - * f2fs_restore_control_page() on the returned ciphertext page to - * release the bounce buffer and the encryption context. - * - * Return: An allocated page with the encrypted content on success. Else, an - * error value or NULL. - */ -struct page *f2fs_encrypt(struct inode *inode, - struct page *plaintext_page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - int err; - - BUG_ON(!PageLocked(plaintext_page)); - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - return (struct page *)ctx; - - /* The encryption operation will require a bounce page. 
*/ - ciphertext_page = alloc_bounce_page(ctx); - if (IS_ERR(ciphertext_page)) - goto err_out; - - ctx->w.control_page = plaintext_page; - err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); - if (err) { - ciphertext_page = ERR_PTR(err); - goto err_out; - } - - SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)ctx); - lock_page(ciphertext_page); - return ciphertext_page; - -err_out: - f2fs_release_crypto_ctx(ctx); - return ciphertext_page; -} - -/** - * f2fs_decrypt() - Decrypts a page in-place - * @ctx: The encryption context. - * @page: The page to decrypt. Must be locked. - * - * Decrypts page in-place using the ctx encryption context. - * - * Called from the read completion callback. - * - * Return: Zero on success, non-zero otherwise. - */ -int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page) -{ - BUG_ON(!PageLocked(page)); - - return f2fs_page_crypto(ctx, page->mapping->host, - F2FS_DECRYPT, page->index, page, page); -} - -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int f2fs_decrypt_one(struct inode *inode, struct page *page) -{ - struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode); - int ret; - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = f2fs_decrypt(ctx, page); - f2fs_release_crypto_ctx(ctx); - return ret; -} - -bool f2fs_valid_contents_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS); -} - -/** - * f2fs_validate_encryption_key_size() - Validate the encryption key size - * @mode: The key mode. - * @size: The key size to validate. - * - * Return: The validated key size for @mode. Zero if invalid. 
- */ -uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size) -{ - if (size == f2fs_encryption_key_size(mode)) - return size; - return 0; -} diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c deleted file mode 100644 index 5de2d866a25c..000000000000 --- a/fs/f2fs/crypto_key.c +++ /dev/null @@ -1,254 +0,0 @@ -/* - * linux/fs/f2fs/crypto_key.c - * - * Copied from linux/fs/f2fs/crypto_key.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions for f2fs - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct f2fs_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - -/** - * f2fs_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. - * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. - * - * Return: Zero on success; non-zero otherwise. 
- */ -static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], - char source_key[F2FS_AES_256_XTS_KEY_SIZE], - char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - ablkcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - F2FS_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - - sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - F2FS_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } -out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); - return res; -} - -static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) -{ - if (!ci) - return; - - key_put(ci->ci_keyring_key); - crypto_free_ablkcipher(ci->ci_ctfm); - kmem_cache_free(f2fs_crypt_info_cachep, ci); -} - -void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(fi->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&fi->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - f2fs_free_crypt_info(ci); -} - -int _f2fs_get_encryption_info(struct inode *inode) -{ - 
struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *crypt_info; - char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct f2fs_encryption_key *master_key; - struct f2fs_encryption_context ctx; - const struct user_key_payload *ukp; - struct crypto_ablkcipher *ctfm; - const char *cipher_str; - char raw_key[F2FS_MAX_KEY_SIZE]; - char mode; - int res; - - res = f2fs_crypto_initialize(); - if (res) - return res; -retry: - crypt_info = ACCESS_ONCE(fi->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - f2fs_free_encryption_info(inode, crypt_info); - goto retry; - } - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res < 0) - return res; - else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "f2fs: unsupported key mode %d (ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - - memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - 
sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, - "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - BUG_ON(keyring_key->type != &key_type_logon); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct f2fs_encryption_key *)ukp->data; - BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != - F2FS_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); - res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - if (res) - goto out; - - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - f2fs_encryption_key_size(mode)); - if (res) - goto out; - - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) { - f2fs_free_crypt_info(crypt_info); - goto retry; - } - return 0; - -out: - if (res == -ENOKEY && !S_ISREG(inode->i_mode)) - res = 0; - - f2fs_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int f2fs_has_encryption_key(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - - return (fi->i_crypt_info != NULL); -} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c deleted file mode 100644 index 
e504f548b64e..000000000000 --- a/fs/f2fs/crypto_policy.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * copied from linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility. - * - * This contains encryption policy functions for f2fs with some modifications - * to support f2fs-specific xattr APIs. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -static int f2fs_inode_has_encryption_context(struct inode *inode) -{ - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL); - return (res > 0); -} - -/* - * check whether the policy is consistent with the encryption context - * for the inode - */ -static int f2fs_is_encryption_context_consistent_with_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL); - - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); -} - -static int f2fs_create_encryption_context_from_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - - if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); - return -EINVAL; - } - - if 
(!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } - - if (policy->flags & ~F2FS_POLICY_FLAGS_VALID) - return -EINVAL; - - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - - return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL, XATTR_CREATE); -} - -int f2fs_process_policy(const struct f2fs_encryption_policy *policy, - struct inode *inode) -{ - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (policy->version != 0) - return -EINVAL; - - if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - - if (!f2fs_inode_has_encryption_context(inode)) { - if (!f2fs_empty_dir(inode)) - return -ENOTEMPTY; - return f2fs_create_encryption_context_from_policy(inode, - policy); - } - - if (f2fs_is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; -} - -int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res; - - if (!f2fs_encrypted_inode(inode)) - return -ENODATA; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res != sizeof(ctx)) - return -ENODATA; - if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - 
memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int f2fs_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - struct f2fs_crypt_info *parent_ci, *child_ci; - int res; - - if ((parent == NULL) || (child == NULL)) { - pr_err("parent %p child %p\n", parent, child); - BUG_ON(1); - } - - /* no restrictions if the parent directory is not encrypted */ - if (!f2fs_encrypted_inode(parent)) - return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!f2fs_encrypted_inode(child)) - return 0; - res = f2fs_get_encryption_info(parent); - if (res) - return 0; - res = f2fs_get_encryption_info(child); - if (res) - return 0; - parent_ci = F2FS_I(parent)->i_crypt_info; - child_ci = F2FS_I(child)->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) - return 0; - - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); -} - -/** - * f2fs_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. 
- * - * Return: Zero on success, non-zero otherwise - */ -int f2fs_inherit_context(struct inode *parent, struct inode *child, - struct page *ipage) -{ - struct f2fs_encryption_context ctx; - struct f2fs_crypt_info *ci; - int res; - - res = f2fs_get_encryption_info(parent); - if (res < 0) - return res; - - ci = F2FS_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE); - - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), ipage, XATTR_CREATE); -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac071..7a3ac306a57c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include "f2fs.h" @@ -32,11 +34,16 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_vec *bvec; int i; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + bio->bi_error = -EIO; +#endif + if (f2fs_bio_encrypted(bio)) { if (bio->bi_error) { - f2fs_release_crypto_ctx(bio->bi_private); + fscrypt_release_ctx(bio->bi_private); } else { - f2fs_end_io_crypto_work(bio->bi_private, bio); + fscrypt_decrypt_bio_pages(bio->bi_private, bio); return; } } @@ -45,7 +52,8 @@ static void f2fs_read_end_io(struct bio *bio) struct page *page = bvec->bv_page; if (!bio->bi_error) { - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); @@ -64,19 +72,16 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - f2fs_restore_and_release_control_page(&page); + 
fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && - !list_empty(&sbi->cp_wait.task_list)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -100,6 +105,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(rw)) { + atomic_inc(&sbi->nr_wb_bios); + if (f2fs_sb_mounted_hmsmr(sbi->sb) && + current->plug && (type == DATA || type == NODE)) + blk_finish_plug(current->plug); + } + submit_bio(rw, bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -112,12 +129,58 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, io->bio); + __submit_bio(io->sbi, fio->rw, io->bio, fio->type); io->bio = NULL; } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - enum page_type type, int rw) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) +{ + struct bio_vec *bvec; + struct page *target; + int i; + + if (!io->bio) + return false; + + if (!inode && !page && !ino) + return true; + + bio_for_each_segment_all(bvec, io->bio, i) { + + if (bvec->bv_page->mapping) + target = bvec->bv_page; + else + target = fscrypt_control_page(bvec->bv_page); + + if (inode && inode == target->mapping->host) + return true; + if (page && page == target) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +static bool has_merged_page(struct f2fs_sb_info *sbi, struct 
inode *inode, + struct page *page, nid_t ino, + enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + bool ret; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, page, ino); + up_read(&io->io_rwsem); + return ret; +} + +static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -126,6 +189,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); + if (!__has_merged_page(io, inode, page, ino)) + goto out; + /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -135,9 +201,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); +out: up_write(&io->io_rwsem); } +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw) +{ + __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); +} + +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) +{ + if (has_merged_page(sbi, inode, page, ino, type)) + __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); +} + +void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); +} + /* * Fill the locked page with data located in the block address. * Return unlocked page. @@ -145,20 +233,21 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct page *page = fio->encrypted_page ? 
+ fio->encrypted_page : fio->page; trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, fio->rw, bio, fio->type); return 0; } @@ -172,39 +261,49 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io = is_read ? &sbi->read_io : &sbi->write_io[btype]; - verify_block_addr(sbi, fio->blk_addr); + if (fio->old_blkaddr != NEW_ADDR) + verify_block_addr(sbi, fio->old_blkaddr); + verify_block_addr(sbi, fio->new_blkaddr); down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - - if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || + if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); + io->bio = __bio_alloc(sbi, fio->new_blkaddr, + bio_blocks, is_read); io->fio = *fio; } bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < + PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } - io->last_block_in_bio = fio->blk_addr; + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -213,39 +312,63 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; } -int reserve_new_block(struct dnode_of_data *dn) +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + dn->data_blkaddr = blkaddr; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return 
-EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); - mark_inode_dirty(dn->inode); - sync_inode_page(dn); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -325,13 +448,14 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); return page; } - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) @@ -385,14 +509,14 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, /* wait for read completion */ lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -412,7 +536,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -436,45 +560,42 @@ struct page *get_new_data_page(struct inode *inode, goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); 
return page; } static int __allocate_data_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -489,72 +610,43 @@ static int __allocate_data_block(struct dnode_of_data *dn) set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) + f2fs_i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, - size_t count) +ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn; - u64 start = F2FS_BYTES_TO_BLK(offset); - u64 len = F2FS_BYTES_TO_BLK(count); - bool allocated; - u64 end_offset; + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_map_blocks map; + ssize_t ret = 0; - while (len) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); 
+ if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; - /* When reading holes, we need its node page */ - set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) - goto out; + map.m_next_pgofs = NULL; - allocated = false; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - - while (dn.ofs_in_node < end_offset && len) { - block_t blkaddr; - - if (unlikely(f2fs_cp_error(sbi))) - goto sync_out; - - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) - goto sync_out; - allocated = true; - } - len--; - start++; - dn.ofs_in_node++; - } - - if (allocated) - sync_inode_page(&dn); - - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } - return; - -sync_out: - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); -out: - f2fs_unlock_op(sbi); - return; + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + if (!f2fs_has_inline_data(inode)) + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + return ret; } /* @@ -566,156 +658,181 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + int mode = create ? 
ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; + block_t blkaddr; + + if (!maxblocks) + return 0; map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; - if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; goto out; } +next_dnode: if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - if (err == -ENOENT) + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + if (err == -ENOENT) { err = 0; + if (map->m_next_pgofs) + *map->m_next_pgofs = + get_next_page_offset(&dn, pgofs); + } goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + prealloc = 0; + ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + +next_block: + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto put_out; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + } else { + err = __allocate_data_block(&dn); + if (!err) { + set_inode_flag(inode, FI_APPEND_WRITE); + allocated = true; + } } - err = __allocate_data_block(&dn); if (err) - goto put_out; - allocated = true; + goto sync_out; map->m_flags = F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; } else { - if (flag != F2FS_GET_BLOCK_FIEMAP || - dn.data_blkaddr != NEW_ADDR) { - if (flag == 
F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; } - - /* - * preallocated unwritten block should be mapped - * for fiemap. - */ - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags = F2FS_MAP_UNWRITTEN; + if (flag == F2FS_GET_BLOCK_FIEMAP && + blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + } + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } } - map->m_flags |= F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - map->m_len = 1; + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + if (map->m_len == 0) { + /* preallocated unwritten block should be mapped for fiemap. */ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + } else if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || + flag == F2FS_GET_BLOCK_PRE_DIO) { + ofs++; + map->m_len++; + } else { + goto sync_out; + } + +skip: dn.ofs_in_node++; pgofs++; -get_next: - if (dn.ofs_in_node >= end_offset) { - if (allocated) - sync_inode_page(&dn); - allocated = false; - f2fs_put_dnode(&dn); + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); - if (err) { - if (err == -ENOENT) - err = 0; - goto unlock_out; + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + allocated = dn.node_changed; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + dn.ofs_in_node = end_offset; } - if 
(maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; - } - } + f2fs_put_dnode(&dn); - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); } + allocated = false; + goto next_dnode; + sync_out: - if (allocated) - sync_inode_page(&dn); -put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); + } out: trace_f2fs_map_blocks(inode, map, err); return err; } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag) + struct buffer_head *bh, int create, int flag, + pgoff_t *next_pgofs) { struct f2fs_map_blocks map; int ret; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; + map.m_next_pgofs = next_pgofs; ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { @@ -727,23 +844,29 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag) + struct buffer_head *bh_result, int create, int flag, + pgoff_t 
*next_pgofs) { - return __get_data_block(inode, iblock, bh_result, create, flag); + return __get_data_block(inode, iblock, bh_result, create, + flag, next_pgofs); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO); + F2FS_GET_BLOCK_DIO, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP); + F2FS_GET_BLOCK_BMAP, NULL); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -761,10 +884,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + pgoff_t next_pgofs; + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -777,82 +900,64 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP); + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); if (ret) goto out; /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof 
&& blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. + start_blk = next_pgofs; + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; + flags |= FIEMAP_EXTENT_LAST; } + + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } + + if (start_blk > last_blk || ret) + goto out; + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + 
start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -862,10 +967,41 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } +static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + + return bio; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
@@ -884,13 +1020,13 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - struct block_device *bdev = inode->i_sb->s_bdev; struct f2fs_map_blocks map; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; + map.m_next_pgofs = NULL; for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { @@ -929,7 +1065,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: @@ -942,8 +1078,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, goto confused; } } else { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); goto next_page; } @@ -954,35 +1091,15 @@ static int f2fs_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } if (bio == NULL) { - struct f2fs_crypto_ctx *ctx = NULL; - - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - goto set_error_page; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback( - F2FS_I_SB(inode), block_nr); - } - - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - f2fs_release_crypto_ctx(ctx); + bio = f2fs_grab_bio(inode, block_nr, nr_pages); + if (IS_ERR(bio)) { + bio = NULL; goto set_error_page; } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -992,22 +1109,22 @@ static int 
f2fs_mpage_readpages(struct address_space *mapping, goto next_page; set_error_page: SetPageError(page); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } unlock_page(page); next_page: if (pages) - page_cache_release(page); + put_page(page); } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); return 0; } @@ -1054,23 +1171,33 @@ int do_write_data_page(struct f2fs_io_info *fio) if (err) return err; - fio->blk_addr = dn.data_blkaddr; + fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (fio->blk_addr == NULL_ADDR) { + if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); goto out_writepage; } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->blk_addr); - - fio->encrypted_page = f2fs_encrypt(inode, fio->page); + fio->old_blkaddr); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } @@ -1081,20 +1208,19 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ - if (unlikely(fio->blk_addr != NEW_ADDR && + if (unlikely(fio->old_blkaddr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -1108,7 +1234,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) - >> PAGE_CACHE_SHIFT; + >> PAGE_SHIFT; + loff_t psize = (page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -1129,37 +1256,37 @@ static int f2fs_write_data_page(struct page *page, * If the offset is out-of-range of file size, * this page does not have to be written to disk. 
*/ - offset = i_size & (PAGE_CACHE_SIZE - 1); + offset = i_size & (PAGE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) goto out; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); write: if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; err = do_write_data_page(&fio); goto done; } - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - SetPageError(page); - goto out; - } - if (!wbc->for_reclaim) need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0)) + else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; err = -EAGAIN; @@ -1168,6 +1295,8 @@ static int f2fs_write_data_page(struct page *page, err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) err = do_write_data_page(&fio); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1178,25 +1307,24 @@ static int f2fs_write_data_page(struct page *page, inode_dec_dirty_pages(inode); if (err) ClearPageUptodate(page); + + if (wbc->for_reclaim) { + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + remove_dirty_inode(inode); + } + unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + 
f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, DATA, WRITE); + return 0; redirty_out: redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + unlock_page(page); + return err; } /* @@ -1205,8 +1333,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) + struct writeback_control *wbc) { int ret = 0; int done = 0; @@ -1219,10 +1346,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; + int nwritten = 0; pagevec_init(&pvec, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1232,8 +1359,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -1277,12 +1404,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, + DATA, true); else goto continue_unlock; } @@ -1291,16 +1416,13 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if 
(!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = (*writepage)(page, wbc, data); + ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done_index = page->index + 1; - done = 1; - break; - } + done_index = page->index + 1; + done = 1; + break; + } else { + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -1313,11 +1435,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping, cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; @@ -1327,6 +1444,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; + if (nwritten) + f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA, WRITE); + return ret; } @@ -1335,11 +1456,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool locked = false; + struct blk_plug plug; int ret; - long diff; - - trace_f2fs_writepages(mapping->host, wbc, DATA); /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1354,41 +1472,119 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. 
*/ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - diff = nr_pages_to_write(sbi, DATA, wbc); + trace_f2fs_writepages(mapping->host, wbc, DATA); - if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&sbi->writepages); - locked = true; - } - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - if (locked) - mutex_unlock(&sbi->writepages); + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc); + blk_finish_plug(&plug); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. + */ - remove_dirty_dir_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + /* + * we already allocated all the blocks, so we don't need to get + * the block addresses when there is no need to fill the page. 
+ */ + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + return 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_inline_node(ipage); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1396,15 +1592,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; - pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, 
len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1424,98 +1618,63 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); - - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; - } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; - } - - err = f2fs_get_block(&dn, index); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + goto fail; - f2fs_wait_on_page_writeback(page, DATA); + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; + } + } + + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_CACHE_SIZE) - goto out_update; - if (PageUptodate(page)) - goto out_clear; + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out_update; 
- } - - if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + if (blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); } else { - struct f2fs_io_info fio = { - .sbi = sbi, - .type = DATA, - .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, - .page = page, - .encrypted_page = NULL, - }; - err = f2fs_submit_page_bio(&fio); - if (err) - goto fail; + struct bio *bio; - lock_page(page); - if (unlikely(!PageUptodate(page))) { - err = -EIO; + bio = f2fs_grab_bio(inode, blkaddr, 1); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); goto fail; } + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + err = -EFAULT; + goto fail; + } + + __submit_bio(sbi, READ_SYNC, bio, DATA); + + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - - /* avoid symlink page */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - err = f2fs_decrypt_one(inode, page); - if (err) - goto fail; + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; } } -out_update: - SetPageUptodate(page); -out_clear: - clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); @@ -1531,15 +1690,28 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - set_page_dirty(page); - - if (pos + copied > i_size_read(inode)) { - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); - update_inode_page(inode); + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. 
+ */ + if (!PageUptodate(page)) { + if (unlikely(copied != PAGE_SIZE)) + copied = 0; + else + SetPageUptodate(page); } + if (!copied) + goto unlock_out; + set_page_dirty(page); + clear_cold_data(page); + + if (pos + copied > i_size_read(inode)) + f2fs_i_size_write(inode, pos + copied); +unlock_out: f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1558,44 +1730,37 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, } static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) + loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; + struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + int rw = iov_iter_rw(iter); int err; - /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - err = check_direct_IO(inode, iter, offset); if (err) return err; - trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; + if (test_opt(F2FS_I_SB(inode), LFS)) + return 0; - if (iov_iter_rw(iter) == WRITE) { - __allocate_data_blocks(inode, offset, count); - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { - err = -EIO; - goto out; - } + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + + down_read(&F2FS_I(inode)->dio_rwsem[rw]); + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); + up_read(&F2FS_I(inode)->dio_rwsem[rw]); + + if (rw == WRITE) { + if (err > 0) + set_inode_flag(inode, FI_UPDATE_WRITE); + else if (err < 0) + f2fs_write_failed(mapping, offset + count); } - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); -out: - if (err < 0 && iov_iter_rw(iter) 
== WRITE) - f2fs_write_failed(mapping, offset + count); - - trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; } @@ -1607,7 +1772,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && - (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)) + (offset % PAGE_SIZE || length != PAGE_SIZE)) return; if (PageDirty(page)) { @@ -1623,6 +1788,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return; + set_page_private(page, 0); ClearPagePrivate(page); } @@ -1636,10 +1802,42 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } +/* + * This was copied from __set_page_dirty_buffers which gives higher performance + * in very high speed storages. 
(e.g., pmem) + */ +void f2fs_set_page_dirty_nobuffers(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + + if (unlikely(!mapping)) + return; + + spin_lock(&mapping->private_lock); + memcg = mem_cgroup_begin_page_stat(page); + SetPageDirty(page); + spin_unlock(&mapping->private_lock); + + spin_lock_irqsave(&mapping->tree_lock, flags); + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping, memcg); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + mem_cgroup_end_page_stat(memcg); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return; +} + static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -1647,7 +1845,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { @@ -1662,7 +1861,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } @@ -1683,6 +1882,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block_bmap); } +#ifdef CONFIG_MIGRATION +#include + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written && !mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + + /* + * A reference is expected 
if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -1695,4 +1946,7 @@ const struct address_space_operations f2fs_dblock_aops = { .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 478e5d54154f..fb245bd302e4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,23 +38,30 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, 
F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -105,7 +112,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -140,6 +147,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ -148,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += 
f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + if (f2fs_discard_en(sbi)) + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -161,7 +171,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; - si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); @@ -189,18 +199,18 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -211,20 +221,24 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition 
info(%s). #%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", @@ -236,6 +250,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -269,7 +285,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -290,17 +307,21 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 
0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4lld in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4lld\n", + si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", @@ -406,20 +427,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c1678ba8f92..e634a637c443 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -17,8 +17,8 @@ static unsigned long dir_blocks(struct inode *inode) { - return ((unsigned long long) (i_size_read(inode) + 
PAGE_CACHE_SIZE - 1)) - >> PAGE_CACHE_SHIFT; + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -77,7 +83,7 @@ static unsigned long dir_block_index(unsigned int level, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct f2fs_filename *fname, + struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct page **res_page) @@ -95,23 +101,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. 
- */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); - struct f2fs_str *name = &fname->disk_name; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -124,6 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, de = &d->dentry[bit_pos]; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -141,10 +147,6 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -157,7 +159,7 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, - struct f2fs_filename *fname, + struct fscrypt_name *fname, struct page **res_page) { struct qstr name = FSTR_TO_QSTR(&fname->disk_name); @@ -170,9 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name); - - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); + if(fname->hash) + namehash = cpu_to_le32(fname->hash); + else + namehash = f2fs_dentry_hash(&name); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -185,8 +188,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = 
find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, &max_slots, @@ -207,6 +215,44 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, return de; } +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + unsigned int max_depth; + unsigned int level; + + if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; + de = find_in_inline_dir(dir, fname, res_page); + goto out; + } + + if (npages == 0) { + *res_page = NULL; + goto out; + } + + max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } + + for (level = 0; level < max_depth; level++) { + *res_page = NULL; + de = find_in_level(dir, level, fname, res_page); + if (de || IS_ERR(*res_page)) + break; + } +out: + return de; +} + /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), @@ -214,72 +260,42 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, * Entry is guaranteed to be valid. 
*/ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - struct qstr *child, struct page **res_page) + const struct qstr *child, struct page **res_page) { - unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - unsigned int max_depth; - unsigned int level; - struct f2fs_filename fname; + struct fscrypt_name fname; int err; - *res_page = NULL; - - err = f2fs_fname_setup_filename(dir, child, 1, &fname); - if (err) + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + *res_page = ERR_PTR(err); return NULL; - - if (f2fs_has_inline_dentry(dir)) { - de = find_in_inline_dir(dir, &fname, res_page); - goto out; } - if (npages == 0) - goto out; + de = __f2fs_find_entry(dir, &fname, res_page); - max_depth = F2FS_I(dir)->i_current_depth; - - for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, &fname, res_page); - if (de) - break; - } -out: - f2fs_fname_free_filename(&fname); + fscrypt_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; + struct qstr dotdot = QSTR_INIT("..", 2); - if (f2fs_has_inline_dentry(dir)) - return f2fs_parent_inline_dir(dir, p); - - page = get_lock_data_page(dir, 0, false); - if (IS_ERR(page)) - return NULL; - - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); - return de; + return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + f2fs_dentry_kunmap(dir, *page); + f2fs_put_page(*page, 0); } return res; @@ -290,14 
+306,14 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, type); + f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -305,7 +321,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); @@ -335,24 +351,14 @@ int update_dent_inode(struct inode *inode, struct inode *to, void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { - struct f2fs_dir_entry *de; + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); - de = &d->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = 0; - de->ino = cpu_to_le32(inode->i_ino); - memcpy(d->filename[0], ".", 1); - set_de_type(de, inode->i_mode); + /* update dirent of "." */ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); - de = &d->dentry[1]; - de->hash_code = 0; - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(d->filename[1], "..", 2); - set_de_type(de, parent->i_mode); - - test_and_set_bit_le(0, (void *)d->bitmap); - test_and_set_bit_le(1, (void *)d->bitmap); + /* update dirent of ".." 
*/ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, @@ -382,32 +388,38 @@ static int make_empty_dir(struct inode *inode, } struct page *init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *name, struct page *dpage) + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) { struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = new_inode_page(inode); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); if (err) goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, orig_name, page); if (err) goto put_error; if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { - err = f2fs_inherit_context(dir, inode, page); + err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; } @@ -419,14 +431,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (name) - init_dent_inode(name, page); + if (new_name) + init_dent_inode(new_name, page); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. 
*/ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + if (is_inode_flag_set(inode, FI_INC_LINK)) { file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, @@ -434,41 +446,33 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, */ if (inode->i_nlink == 0) remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); - inc_nlink(inode); + f2fs_i_links_write(inode, true); } return page; put_error: + clear_nlink(inode); + update_inode(inode, page); f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ - truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); - remove_inode_page(inode); return ERR_PTR(err); } void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + f2fs_mark_inode_dirty_sync(dir); - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } int room_for_filename(const void *bitmap, int slots, int max_slots) @@ -505,15 +509,16 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, memcpy(d->filename[bit_pos], name->name, 
name->len); de->ino = cpu_to_le32(ino); set_de_type(de, mode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). - */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -526,28 +531,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct f2fs_filename fname; - struct qstr new_name; - int slots, err; - - err = f2fs_fname_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -556,10 +544,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -573,10 +563,8 @@ int 
__f2fs_add_link(struct inode *dir, const struct qstr *name, for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -592,11 +580,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA); + f2fs_wait_on_page_writeback(dentry_page, DATA, true); if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, + orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -606,14 +595,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); if (inode) { - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -622,14 +609,49 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, if (inode) up_write(&F2FS_I(inode)->i_sem); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode_page(dir); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: - f2fs_fname_free_filename(&fname); + + return err; +} + +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if 
(f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + + fscrypt_free_filename(&fname); return err; } @@ -639,46 +661,39 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } - /* we don't need to mark_inode_dirty now */ - update_inode(inode, page); f2fs_put_page(page, 1); - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + clear_inode_flag(inode, FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } -void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - if (page) - update_inode(dir, page); - else - update_inode_page(dir); - } + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); + f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); } 
up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); else release_orphan_inode(sbi); } @@ -695,11 +710,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; @@ -714,9 +731,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, NULL); + f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { @@ -767,12 +785,12 @@ bool f2fs_empty_dir(struct inode *dir) } bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, - unsigned int start_pos, struct f2fs_str *fstr) + unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -782,10 +800,13 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, break; de = &d->dentry[bit_pos]; - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + if (de->name_len == 0) { + bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -794,15 +815,9 @@ bool 
f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); - if (!de_name.name) - return false; - - memcpy(de_name.name, d->filename[bit_pos], de_name.len); - - ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, - &de_name, fstr); - kfree(de_name.name); + ret = fscrypt_fname_disk_to_usr(d->inode, + (u32)de->hash_code, 0, + &de_name, fstr); if (ret < 0) return true; @@ -829,16 +844,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; - struct f2fs_str fstr = FSTR_INIT(NULL, 0); + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; if (f2fs_encrypted_inode(inode)) { - err = f2fs_get_encryption_info(inode); - if (err) + err = fscrypt_get_encryption_info(inode); + if (err && err != -ENOKEY) return err; - err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, - &fstr); + err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) return err; } @@ -855,36 +869,47 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } + err = 0; out: - 
f2fs_fname_crypto_free_buffer(&fstr); + fscrypt_fname_free_buffer(&fstr); return err; } +static int f2fs_dir_open(struct inode *inode, struct file *filp) +{ + if (f2fs_encrypted_inode(inode)) + return fscrypt_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate = f2fs_readdir, .fsync = f2fs_sync_file, + .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e11b..2b06d4fcd954 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -33,10 +33,11 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, en->ei = *ei; INIT_LIST_HEAD(&en->list); + en->et = et; rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,11 +46,29 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. 
+ */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + spin_lock(&sbi->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); } static struct extent_tree *__grab_extent_tree(struct inode *inode) @@ -68,11 +87,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -127,32 +148,21 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) + struct extent_tree *et) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } + __release_extent_node(sbi, et, en); node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -160,38 +170,38 @@ static void __drop_largest_extent(struct inode *inode, { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (fofs < largest->fofs + largest->len && 
fofs + len > largest->fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; + f2fs_mark_inode_dirty_sync(inode); + } } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +212,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -230,9 +241,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, if (en) { *ei = en->ei; spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) + if (!list_empty(&en->list)) { list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; + et->cached_en = en; + } spin_unlock(&sbi->extent_lock); ret = true; } @@ -325,12 +337,12 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, return en; } -static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info 
*sbi, +static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, - struct extent_node **den, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -340,28 +352,34 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) { - __detach_extent_node(sbi, et, prev_ex); - *den = prev_ex; - } + if (en) + __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; en = next_ex; } - if (en) { - __try_update_largest_extent(et, en); + if (!en) + return NULL; + + __try_update_largest_extent(inode, et, en); + + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); et->cached_en = en; } + spin_unlock(&sbi->extent_lock); return en; } -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, +static struct extent_node *__insert_extent_tree(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p = &et->root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -388,8 +406,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); + + /* update in global extent list */ + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); et->cached_en = en; + spin_unlock(&sbi->extent_lock); return en; } @@ -412,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, write_lock(&et->lock); - if (is_inode_flag_set(F2FS_I(inode), 
FI_NO_EXTENT)) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); return false; } @@ -454,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(sbi, et, &ei, + en1 = __insert_extent_tree(inode, et, &ei, NULL, NULL); next_en = en1; } else { @@ -475,9 +498,9 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); else - __detach_extent_node(sbi, et, en); + __release_extent_node(sbi, et, en); /* * if original extent is split into zero or two parts, extent @@ -488,58 +511,28 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, insert_p = NULL; insert_parent = NULL; } - - /* update in global extent list */ - spin_lock(&sbi->extent_lock); - if (!parts && !list_empty(&en->list)) - list_del(&en->list); - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - - /* release extent node */ - if (!parts) - kmem_cache_free(extent_node_slab, en); - en = next_en; } /* 3. 
update extent in extent cache */ if (blkaddr) { - struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en1 = __try_merge_extent_node(sbi, et, &ei, &den, - prev_en, next_en); - if (!en1) - en1 = __insert_extent_tree(sbi, et, &ei, + if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) + __insert_extent_tree(inode, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + __drop_largest_extent(inode, 0, UINT_MAX); + set_inode_flag(inode, FI_NO_EXTENT); } - - spin_lock(&sbi->extent_lock); - if (en1) { - if (list_empty(&en1->list)) - list_add_tail(&en1->list, &sbi->extent_list); - else - list_move_tail(&en1->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - if (den) - kmem_cache_free(extent_node_slab, den); } - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - __free_extent_tree(sbi, et, true); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __free_extent_tree(sbi, et); write_unlock(&et->lock); @@ -548,46 +541,42 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; - unsigned int found; + struct extent_tree *et, *next; + struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. 
remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et); + write_unlock(&et->lock); } + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); } up_write(&sbi->extent_tree_lock); +free_node: /* 2. 
remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -595,34 +584,29 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) remained = nr_shrink - (node_cnt + tree_cnt); spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!remained--) + for (; remained > 0; remained--) { + if (list_empty(&sbi->extent_list)) break; + en = list_first_entry(&sbi->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &sbi->extent_list); + continue; + } + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&sbi->extent_lock); } spin_unlock(&sbi->extent_lock); - /* - * reset ino for searching victims from beginning of global extent tree. - */ - ino = F2FS_ROOT_INO(sbi); - - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } unlock_out: up_write(&sbi->extent_tree_lock); out: @@ -637,16 +621,29 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); + node_cnt = __free_extent_tree(sbi, et); write_unlock(&et->lock); return node_cnt; } +void f2fs_drop_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + set_inode_flag(inode, 
FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + __drop_largest_extent(inode, 0, UINT_MAX); + write_unlock(&et->lock); +} + void f2fs_destroy_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -656,8 +653,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +667,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -689,20 +689,20 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs; + block_t blkaddr; if (!f2fs_may_extent_tree(dn->inode)) return; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + if (dn->data_blkaddr == NEW_ADDR) + blkaddr = NULL_ADDR; + else + blkaddr = dn->data_blkaddr; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) - sync_inode_page(dn); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } void 
f2fs_update_extent_cache_range(struct dnode_of_data *dn, @@ -712,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, if (!f2fs_may_extent_tree(dn->inode)) return; - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } void init_extent_cache_info(struct f2fs_sb_info *sbi) @@ -722,7 +721,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9db5500d63d9..af293e84e5cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,10 +21,12 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) -#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(sbi, condition) \ do { \ @@ -33,7 +35,30 @@ set_sbi_flag(sbi, SBI_NEED_FSCK); \ } \ } while (0) -#define f2fs_down_write(x, y) down_write(x) +#endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_IO, + FAULT_CHECKPOINT, + FAULT_MAX, +}; + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) #endif /* @@ -54,6 +79,10 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 
+#define F2FS_MOUNT_LFS 0x00040000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -74,6 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_HMSMR 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -82,25 +112,30 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) -#define CRCPOLY_LE 0xedb88320 - -static inline __u32 f2fs_crc32(void *buf, size_t len) +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) { - unsigned char *p = (unsigned char *)buf; - __u32 crc = F2FS_SUPER_MAGIC; - int i; - - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); - } - return crc; + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. 
+ */ + smp_mb(); + return waitqueue_active(wq); } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) +static inline void inode_nohighmem(struct inode *inode) { - return f2fs_crc32(buf, buf_size) == blk_crc; + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } /* @@ -119,12 +154,13 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 32 +#define DEF_BATCHED_TRIM_SECTIONS 2 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -158,13 +194,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. 
- */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -177,46 +207,52 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +struct bio_entry { + struct list_head list; + struct bio *bio; + struct completion event; + int error; +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; -#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) -#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) -#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) -#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) -#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) -#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) -#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) -#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) -static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { - int before = nats_in_cursum(rs); - rs->n_nats = cpu_to_le16(before + i); + int before = 
nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } -static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { - int before = sits_in_cursum(rs); - rs->n_sits = cpu_to_le16(before + i); + int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } -static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, - int type) +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) { if (type == NAT_JOURNAL) - return size <= MAX_NAT_JENTRIES(sum); - return size <= MAX_SIT_JENTRIES(sum); + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); } /* @@ -234,13 +270,13 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) -#define F2FS_IOC_SET_ENCRYPTION_POLICY \ - _IOR('f', 19, struct f2fs_encryption_policy) -#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ - _IOW('f', 20, __u8[16]) -#define F2FS_IOC_GET_ENCRYPTION_POLICY \ - _IOW('f', 21, struct f2fs_encryption_policy) +#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT /* * should be same as XFS_IOC_GOINGDOWN. 
@@ -256,33 +292,27 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + /* * For INODE and NODE manager */ /* for directory operations */ -struct f2fs_str { - unsigned char *name; - u32 len; -}; - -struct f2fs_filename { - const struct qstr *usr_fname; - struct f2fs_str disk_name; - f2fs_hash_t hash; -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_str crypto_buf; -#endif -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - struct f2fs_dentry_ptr { struct inode *inode; const void *bitmap; @@ -350,6 +380,7 @@ struct extent_node { struct rb_node rb_node; /* rb node located in rb-tree */ struct list_head list; /* node in global extent list of sbi */ struct extent_info ei; /* extent info */ + struct extent_tree *et; /* extent tree pointer */ }; struct extent_tree { @@ -357,9 +388,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -378,6 +409,7 
@@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ }; /* for flag in get_data_block */ @@ -385,6 +417,8 @@ struct f2fs_map_blocks { #define F2FS_GET_BLOCK_DIO 1 #define F2FS_GET_BLOCK_FIEMAP 2 #define F2FS_GET_BLOCK_BMAP 3 +#define F2FS_GET_BLOCK_PRE_DIO 4 +#define F2FS_GET_BLOCK_PRE_AIO 5 /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -406,15 +440,6 @@ struct f2fs_map_blocks { #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) -/* Encryption algorithms */ -#define F2FS_ENCRYPTION_MODE_INVALID 0 -#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4 - -#include "f2fs_crypto.h" - #define DEF_DIR_LEVEL 0 struct f2fs_inode_info { @@ -429,30 +454,27 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + loff_t last_disk_size; /* lastly written file size */ + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - /* Encryption params 
*/ - struct f2fs_crypt_info *i_crypt_info; -#endif + struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, @@ -497,11 +519,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) +extern void f2fs_mark_inode_dirty_sync(struct inode *); +static inline void __try_update_largest_extent(struct inode *inode, + struct extent_tree *et, struct extent_node *en) { - if (en->ei.len > et->largest.len) + if (en->ei.len > et->largest.len) { et->largest = en->ei; + f2fs_mark_inode_dirty_sync(inode); + } } struct f2fs_nm_info { @@ -511,6 +536,7 @@ struct f2fs_nm_info { nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -544,6 +570,9 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ block_t data_blkaddr; /* block address of the node block */ }; @@ -594,6 +623,7 @@ struct flush_cmd { struct flush_cmd_control { struct 
task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t submit_flush; /* # of issued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -618,6 +648,7 @@ struct f2fs_sm_info { /* for small discard management */ struct list_head discard_list; /* 4KB discard list */ + struct list_head wait_list; /* linked with issued discard bio */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -645,11 +676,12 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, NR_COUNT_TYPE, }; @@ -673,6 +705,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_REVOKE, IPU, OPU, }; @@ -681,7 +714,8 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ - block_t blk_addr; /* block address to be written */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; @@ -695,6 +729,13 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -709,15 +750,31 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* 
need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - int s_flag; /* flags for sbi */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -728,32 +785,36 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* 
for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -770,17 +831,24 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + atomic_t nr_wb_bios; /* # of writeback bios */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* valid inode count */ + struct percpu_counter 
total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -809,7 +877,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -822,11 +890,102 @@ struct f2fs_sb_info { struct list_head s_list; struct mutex umount_mutex; unsigned int shrinker_run_no; + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* For fault injection */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; +#endif }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + +/* For write statistics. Suppose sector size is 512 bytes, + * and the return value is in kbytes. s is of struct f2fs_sb_info. 
+ */ +#define BD_PART_WRITTEN(s) \ +(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ + s->sectors_written_start) >> 1) + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); + u32 *ctx = (u32 *)shash_desc_ctx(shash); + int err; + + shash->tfm = sbi->s_chksum_driver; + shash->flags = 0; + *ctx = F2FS_SUPER_MAGIC; + + err = crypto_shash_update(shash, address, length); + BUG_ON(err); + + return *ctx; +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -909,17 +1068,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { - return sbi->s_flag & (0x01 << type); + return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag |= (0x01 << type); + set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int 
type) { - sbi->s_flag &= ~(0x01 << type); + clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -927,26 +1086,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + spin_lock(&sbi->cp_lock); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { 
down_read(&sbi->cp_rwsem); @@ -959,7 +1149,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) @@ -985,8 +1175,8 @@ static inline bool __remain_node_summaries(int reason) static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { - return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* @@ -1019,22 +1209,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { - block_t valid_block_count; + blkcnt_t diff; + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) + return false; +#endif + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. 
+ */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (unlikely(valid_block_count > sbi->user_block_count)) { - spin_unlock(&sbi->stat_lock); - return false; + sbi->total_valid_block_count += (block_t)(*count); + if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { + diff = sbi->total_valid_block_count - sbi->user_block_count; + *count -= diff; + sbi->total_valid_block_count = sbi->user_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + percpu_counter_sub(&sbi->alloc_valid_block_count, diff); + return false; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; spin_unlock(&sbi->stat_lock); + + f2fs_i_blocks_write(inode, *count, true); return true; } @@ -1045,27 +1250,31 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < count); - inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); + f2fs_i_blocks_write(inode, count, false); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + return; + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1074,28 +1283,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -1103,6 +1312,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) return sbi->total_valid_block_count; } +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + static inline unsigned long 
__bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1182,13 +1396,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } if (inode) - inode->i_blocks++; + f2fs_i_blocks_write(inode, 1, true); - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } @@ -1201,7 +1415,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !inode->i_blocks); - inode->i_blocks--; + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -1215,28 +1429,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return 
grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1261,7 +1477,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } - page_cache_release(page); + put_page(page); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -1396,13 +1612,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ - FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1416,71 +1631,152 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) { - if (!test_bit(flag, &fi->flags)) - set_bit(flag, &fi->flags); + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + f2fs_mark_inode_dirty_sync(inode); + } } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void set_inode_flag(struct inode *inode, int flag) { - return test_bit(flag, &fi->flags); + if 
(!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline int is_inode_flag_set(struct inode *inode, int flag) { - if (test_bit(flag, &fi->flags)) - clear_bit(flag, &fi->flags); + return test_bit(flag, &F2FS_I(inode)->flags); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void clear_inode_flag(struct inode *inode, int flag) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); } -static inline void get_inline_info(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_acl_inode(struct inode *inode, umode_t mode) { + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + blkcnt_t diff, bool add) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + inode->i_blocks = add ? 
inode->i_blocks + diff : + inode->i_blocks - diff; + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode) +{ + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + if (ri->i_inline & F2FS_INLINE_XATTR) - set_inode_flag(fi, FI_INLINE_XATTR); + set_bit(FI_INLINE_XATTR, &fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_inode_flag(fi, FI_INLINE_DATA); + set_bit(FI_INLINE_DATA, &fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_inode_flag(fi, FI_INLINE_DENTRY); + set_bit(FI_INLINE_DENTRY, &fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_inode_flag(fi, FI_DATA_EXIST); + set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_inode_flag(fi, FI_INLINE_DOTS); + set_bit(FI_INLINE_DOTS, &fi->flags); } -static inline void set_raw_inline(struct f2fs_inode_info *fi, - struct 
f2fs_inode *ri) +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; - if (is_inode_flag_set(fi, FI_INLINE_DATA)) + if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; - if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; - if (is_inode_flag_set(fi, FI_DATA_EXIST)) + if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; - if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); + return is_inode_flag_set(inode, FI_INLINE_XATTR); } -static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(&fi->vfs_inode)) + if (f2fs_has_inline_xattr(inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } @@ -1502,43 +1798,43 @@ static inline int inline_xattr_size(struct inode *inode) static inline int f2fs_has_inline_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); + return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline void f2fs_clear_inline_inode(struct inode *inode) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + clear_inode_flag(inode, FI_INLINE_DATA); + clear_inode_flag(inode, FI_DATA_EXIST); } static inline int f2fs_exist_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); + return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_has_inline_dots(struct inode *inode) { - return 
is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); + return is_inode_flag_set(inode, FI_INLINE_DOTS); } static inline bool f2fs_is_atomic_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); + return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_volatile_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_VOLATILE_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); } static inline bool f2fs_is_drop_cache(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); + return is_inode_flag_set(inode, FI_DROP_CACHE); } static inline void *inline_data_addr(struct page *page) @@ -1549,7 +1845,7 @@ static inline void *inline_data_addr(struct page *page) static inline int f2fs_has_inline_dentry(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); + return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) @@ -1566,11 +1862,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1580,13 +1878,7 @@ static inline int f2fs_readonly(struct super_block *sb) static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { - return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); -} - -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + return is_set_ckpt_flags(sbi, 
CP_ERROR_FLAG); } static inline bool is_dot_dotdot(const struct qstr *str) @@ -1602,13 +1894,21 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - mode_t mode = inode->i_mode; - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || - is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT)) return false; - return S_ISREG(mode); + return S_ISREG(inode->i_mode); +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; +#endif + return kmalloc(size, flags); } static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) @@ -1632,14 +1932,14 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) } #define get_inode_mode(i) \ - ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) /* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ - ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ - (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) +#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ + ((pgofs < ADDRS_PER_INODE(inode)) ? 
ADDRS_PER_INODE(inode) : \ + (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) /* * file.c @@ -1647,7 +1947,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1660,9 +1960,10 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +struct inode *f2fs_iget_retry(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1675,29 +1976,34 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, +unsigned char get_de_type(struct f2fs_dir_entry *); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct f2fs_str *); + unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct 
inode *, struct inode *, - const struct qstr *, struct page *); + const struct qstr *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, +void f2fs_drop_nlink(struct inode *, struct inode *); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, + struct page **); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); +int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, + nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -1714,10 +2020,13 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_inode_dirtied(struct inode *); +void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) void f2fs_msg(struct super_block *, const char *, const char *, ...); +int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c @@ -1735,6 +2044,7 @@ 
int need_dentry_mark(struct f2fs_sb_info *, nid_t); bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); @@ -1745,8 +2055,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +void move_node_page(struct page *, int); +int fsync_node_pages(struct f2fs_sb_info *, struct inode *, + struct writeback_control *, bool); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); +void build_free_nids(struct f2fs_sb_info *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1766,8 +2079,9 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void drop_inmem_pages(struct inode *); +int commit_inmem_pages(struct inode *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1777,7 +2091,6 @@ bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); 
void release_discard_addrs(struct f2fs_sb_info *); -bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -1787,16 +2100,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(unsigned int, struct f2fs_io_info *); void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); void rewrite_data_page(struct f2fs_io_info *); +void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, + block_t, block_t, bool, bool); void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool); + block_t, block_t, unsigned char, bool, bool); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_summary_block *, - int, unsigned int, int); +int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); @@ -1806,6 +2120,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); @@ -1813,21 +2128,21 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int 
ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int f2fs_sync_inode_meta(struct f2fs_sb_info *); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void add_orphan_inode(struct inode *); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1836,34 +2151,46 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, + struct page *, nid_t, enum page_type, int); +void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void 
set_data_blkaddr(struct dnode_of_data *); +void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); +ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); +#endif /* * gc.c */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); +block_t start_bidx_of_node(unsigned int, struct inode *); int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1877,18 +2204,20 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + s64 ndirty_node, 
ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + s64 inmem_pages; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; - unsigned int valid_count, valid_node_count, valid_inode_count; + int bg_gc, wb_bios; + int inline_xattr, inline_inode, inline_dir, orphans; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1909,10 +2238,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1987,14 +2317,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define 
stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2015,7 +2346,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2044,16 +2375,15 @@ int f2fs_convert_inline_inode(struct inode *); int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct f2fs_filename *, struct page **); -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); + struct fscrypt_name *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, - nid_t, umode_t); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct f2fs_str *); + struct fscrypt_str *); int f2fs_inline_data_fiemap(struct inode *, struct fiemap_extent_info *, __u64, __u64); @@ -2069,8 +2399,8 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void 
f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +void f2fs_drop_extent_tree(struct inode *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2084,13 +2414,9 @@ void destroy_extent_cache(void); /* * crypto support */ -static inline int f2fs_encrypted_inode(struct inode *inode) +static inline bool f2fs_encrypted_inode(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return file_is_encrypt(inode); -#else - return 0; -#endif } static inline void f2fs_set_encrypted_inode(struct inode *inode) @@ -2102,26 +2428,38 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) static inline bool f2fs_bio_encrypted(struct bio *bio) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + return bio->bi_private != NULL; } static inline int f2fs_sb_has_crypto(struct super_block *sb) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); -#else - return 0; -#endif +} + +static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); +} + +static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } } static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else @@ -2129,86 +2467,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -/* crypto_policy.c */ -int 
f2fs_is_child_context_consistent_with_parent(struct inode *, - struct inode *); -int f2fs_inherit_context(struct inode *, struct inode *, struct page *); -int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); -int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); - -/* crypt.c */ -extern struct kmem_cache *f2fs_crypt_info_cachep; -bool f2fs_valid_contents_enc_mode(uint32_t); -uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); -struct page *f2fs_encrypt(struct inode *, struct page *); -int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); -int f2fs_decrypt_one(struct inode *, struct page *); -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); - -/* crypto_key.c */ -void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); -int _f2fs_get_encryption_info(struct inode *inode); - -/* crypto_fname.c */ -bool f2fs_valid_filenames_enc_mode(uint32_t); -u32 f2fs_fname_crypto_round_up(u32, u32); -int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *); -int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *, - const struct f2fs_str *, struct f2fs_str *); -int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *, - struct f2fs_str *); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION -void f2fs_restore_and_release_control_page(struct page **); -void f2fs_restore_control_page(struct page *); - -int __init f2fs_init_crypto(void); -int f2fs_crypto_initialize(void); -void f2fs_exit_crypto(void); - -int f2fs_has_encryption_key(struct inode *); - -static inline int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return 
_f2fs_get_encryption_info(inode); - return 0; -} - -void f2fs_fname_crypto_free_buffer(struct f2fs_str *); -int f2fs_fname_setup_filename(struct inode *, const struct qstr *, - int lookup, struct f2fs_filename *); -void f2fs_fname_free_filename(struct f2fs_filename *); -#else -static inline void f2fs_restore_and_release_control_page(struct page **p) { } -static inline void f2fs_restore_control_page(struct page *p) { } - -static inline int __init f2fs_init_crypto(void) { return 0; } -static inline void f2fs_exit_crypto(void) { } - -static inline int f2fs_has_encryption_key(struct inode *i) { return 0; } -static inline int f2fs_get_encryption_info(struct inode *i) { return 0; } -static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { } - -static inline int f2fs_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct f2fs_filename *fname) -{ - memset(fname, 0, sizeof(struct f2fs_filename)); - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *)iname->name; - fname->disk_name.len = iname->len; - return 0; -} - -static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { } +#ifndef CONFIG_F2FS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_process_policy fscrypt_notsupp_process_policy +#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define 
fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk #endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h deleted file mode 100644 index c2c1c2b63b25..000000000000 --- a/fs/f2fs/f2fs_crypto.h +++ /dev/null @@ -1,151 +0,0 @@ -/* - * linux/fs/f2fs/f2fs_crypto.h - * - * Copied from linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for f2fs - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. 
- */ -#ifndef _F2FS_CRYPTO_H -#define _F2FS_CRYPTO_H - -#include - -#define F2FS_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct f2fs_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 - -#define F2FS_POLICY_FLAGS_PAD_4 0x00 -#define F2FS_POLICY_FLAGS_PAD_8 0x01 -#define F2FS_POLICY_FLAGS_PAD_16 0x02 -#define F2FS_POLICY_FLAGS_PAD_32 0x03 -#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 -#define F2FS_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct f2fs_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; - char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define F2FS_XTS_TWEAK_SIZE 16 -#define F2FS_AES_128_ECB_KEY_SIZE 16 -#define F2FS_AES_256_GCM_KEY_SIZE 32 -#define F2FS_AES_256_CBC_KEY_SIZE 32 -#define F2FS_AES_256_CTS_KEY_SIZE 32 -#define F2FS_AES_256_XTS_KEY_SIZE 64 -#define F2FS_MAX_KEY_SIZE 64 - -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 - -struct f2fs_encryption_key { - __u32 mode; - char raw[F2FS_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct f2fs_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; - char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define 
F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define F2FS_WRITE_PATH_FL 0x00000002 - -struct f2fs_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ -}; - -struct f2fs_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ - struct f2fs_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int f2fs_encryption_key_size(int mode) -{ - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - return F2FS_AES_256_XTS_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_GCM: - return F2FS_AES_256_GCM_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CBC: - return F2FS_AES_256_CBC_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - return F2FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define F2FS_CRYPTO_BLOCK_SIZE 16 -#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct f2fs_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. 
- */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); -} -#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a197215ad52b..c6e33258fabf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -40,8 +42,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +57,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -74,19 +76,20 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { unsigned offset; - offset = i_size_read(inode) & ~PAGE_CACHE_MASK; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -96,6 +99,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -132,7 +136,7 @@ static inline bool need_do_checkpoint(struct inode *inode) if 
(!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; @@ -170,21 +174,16 @@ static void try_to_fix_pino(struct inode *inode) fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { - fi->i_pino = pino; + f2fs_i_pino_write(inode, pino); file_got_pino(inode); - up_write(&fi->i_sem); - - mark_inode_dirty_sync(inode); - f2fs_write_inode(inode, NULL); - } else { - up_write(&fi->i_sem); } + up_write(&fi->i_sem); } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; @@ -201,10 +200,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) - set_inode_flag(fi, FI_NEED_IPU); + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(inode, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -212,7 +211,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync) { + if (!datasync && !f2fs_skip_inode_update(inode)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -220,29 +219,26 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int 
datasync) /* * if there is no written data, don't waste time to write recovery info. */ - if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; - if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - down_read(&fi->i_sem); + down_read(&F2FS_I(inode)->i_sem); need_cp = need_do_checkpoint(inode); - up_read(&fi->i_sem); + up_read(&F2FS_I(inode)->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ @@ -253,19 +249,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * will be used only for fsynced inodes after checkpoint. 
*/ try_to_fix_pino(inode); - clear_inode_flag(fi, FI_APPEND_WRITE); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); - - /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + ret = fsync_node_pages(sbi, inode, &wbc, atomic); + if (ret) goto out; + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -275,18 +275,24 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); + remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); - clear_inode_flag(fi, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -300,7 +306,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, pagevec_init(&pvec, 0); nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; + pgofs = nr_pages ? 
pvec.pages[0]->index : ULONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -332,7 +338,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -345,32 +351,31 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto found; } - pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = PGOFS_OF_NEXT_DNODE(pgofs, - F2FS_I(inode)); + pgofs = get_next_page_offset(&dn, pgofs); continue; } else { goto found; } } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, - data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); @@ -387,10 +392,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -418,19 +423,20 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct 
vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode); if (err) return 0; + if (!f2fs_encrypted_inode(inode)) + return -ENOKEY; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -440,12 +446,22 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); + struct dentry *dir; if (!ret && f2fs_encrypted_inode(inode)) { - ret = f2fs_get_encryption_info(inode); + ret = fscrypt_get_encryption_info(inode); if (ret) - ret = -EACCES; + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; } + dir = dget_parent(file_dentry(filp)); + if (f2fs_encrypted_inode(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + dput(dir); + return -EPERM; + } + dput(dir); return ret; } @@ -468,8 +484,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) set_data_blkaddr(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(F2FS_I(dn->inode), - FI_FIRST_BLOCK_WRITTEN); + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; } @@ -480,14 +495,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) * we will invalidate all blkaddr in the whole range. 
*/ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), - F2FS_I(dn->inode)) + ofs; + dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); - sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -501,8 +515,8 @@ void truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; @@ -510,7 +524,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = f2fs_grab_cache_page(mapping, index, false); + page = find_lock_page(mapping, index); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); @@ -521,9 +535,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (IS_ERR(page)) return 0; truncate_out: - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, offset, PAGE_CACHE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user(page, offset, PAGE_SIZE - offset); + if (!cache_only || !f2fs_encrypted_inode(inode) || + !S_ISREG(inode->i_mode)) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -543,6 +558,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -561,14 +579,14 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, 
ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; goto out; } - count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = ADDRS_PER_PAGE(dn.node_page, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); @@ -584,7 +602,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); @@ -593,7 +611,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) return err; } -int f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode) { int err; @@ -604,18 +622,18 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - err = truncate_blocks(inode, i_size_read(inode), lock); + err = truncate_blocks(inode, i_size_read(inode), true); if (err) return err; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -631,7 +649,6 @@ int f2fs_getattr(struct vfsmount *mnt, #ifdef CONFIG_F2FS_FS_POSIX_ACL static void __setattr_copy(struct inode *inode, const struct iattr *attr) { - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) @@ -652,7 +669,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else @@ -662,7 +679,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct 
dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = inode_change_ok(inode, attr); @@ -671,21 +687,28 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { if (f2fs_encrypted_inode(inode) && - f2fs_get_encryption_info(inode)) + fscrypt_get_encryption_info(inode)) return -EACCES; if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -694,13 +717,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, get_inode_mode(inode)); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return err; } @@ -727,7 +750,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -736,7 +759,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (IS_ERR(page)) return PTR_ERR(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, start, len); set_page_dirty(page); 
f2fs_put_page(page, 1); @@ -761,7 +784,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -778,19 +801,17 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -800,7 +821,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; } @@ -815,10 +836,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); - blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; - blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -831,83 +852,199 @@ static int punch_hole(struct inode *inode, 
loff_t offset, loff_t len) return ret; } -static int __exchange_data_block(struct inode *inode, pgoff_t src, - pgoff_t dst, bool full) +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - block_t new_addr; - bool do_replace = false; - int ret; + int ret, done, i; +next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - if (!is_checkpointed_data(sbi, new_addr)) { - dn.data_blkaddr = NULL_ADDR; + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (!is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + /* do not invalidate this block address */ - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - do_replace = true; + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + 
dec_valid_block_count(sbi, inode, 1); + invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); } f2fs_put_dnode(&dn); } + return 0; +} - if (new_addr == NULL_ADDR) - return full ? truncate_hole(inode, dst, dst + 1) : 0; +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; - if (do_replace) { - struct page *ipage = get_node_page(sbi, inode->i_ino); - struct node_info ni; - - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto err_out; + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; } - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, dst); - if (ret) - goto err_out; + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; - truncate_data_blocks_range(&dn, 1); + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, - ni.version, true); - f2fs_put_dnode(&dn); - } else { - struct page *psrc, *pdst; + get_node_info(sbi, dn.nid, &ni); + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.node_page, + dn.ofs_in_node); + truncate_data_blocks_range(&dn, 1); - psrc = get_lock_data_page(inode, src, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = get_new_data_page(inode, NULL, dst, false); - if (IS_ERR(pdst)) { + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false); + f2fs_i_blocks_write(dst_inode, + 1, true); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + 
i++; + new_size = (dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(src_inode, src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); - } - f2fs_copy_page(psrc, pdst); - set_page_dirty(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); - return truncate_hole(inode, src, src + 1); + ret = truncate_hole(src_inode, src + i, src + i + 1); + if (ret) + return ret; + i++; + } + } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } return 0; -err_out: - if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - f2fs_put_dnode(&dn); - } +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, 
len); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } @@ -915,16 +1052,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; + int ret; - for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, end, start, true); - f2fs_unlock_op(sbi); - if (ret) - break; - } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); return ret; } @@ -941,16 +1077,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } - - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -972,7 +1104,50 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + 
} + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. + */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -991,13 +1166,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1005,11 +1176,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -1023,48 +1194,40 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; new_size = max_t(loff_t, new_size, - 
(loff_t)pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); - dn.data_blkaddr = NEW_ADDR; - set_data_blkaddr(&dn); - - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - } + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_CACHE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { @@ -1077,11 +1240,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1089,7 +1249,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t pg_start, pg_end, delta, nrpages, idx; + pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1104,13 +1264,11 @@ static int f2fs_insert_range(struct 
inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1123,17 +1281,23 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; - nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; - for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, idx, idx + delta, false); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); f2fs_unlock_op(sbi); - if (ret) - break; } /* write out all moved pages, if possible */ @@ -1141,7 +1305,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1149,60 +1313,48 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; - - f2fs_balance_fs(sbi); + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, (len + offset)); if (ret) 
return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); + + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; + + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; + + if (!map.m_len) return ret; + + last_off = map.m_lblk + map.m_len - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len: + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; - - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); - - f2fs_lock_op(sbi); - - for (index = pg_start; index <= pg_end; index++) { - struct dnode_of_data dn; - - if (index == pg_end && !off_end) - goto noalloc; - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = f2fs_reserve_block(&dn, index); - if (ret) - break; -noalloc: - if (pg_start == pg_end) - new_size = offset + len; - else if (index == pg_start && off_start) - new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; - else if (index == pg_end) - new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + - off_end; - else - new_size += PAGE_CACHE_SIZE; - } - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } - f2fs_unlock_op(sbi); + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1226,7 +1378,7 @@ static long f2fs_fallocate(struct file *file, int mode, 
FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1245,11 +1397,12 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1257,13 +1410,22 @@ static long f2fs_fallocate(struct file *file, int mode, static int f2fs_release_file(struct inode *inode, struct file *filp) { + /* + * f2fs_relase_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. + */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_DROP_CACHE); } return 0; } @@ -1293,33 +1455,29 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags; unsigned int oldflags; int ret; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + ret = mnt_want_write_file(filp); if (ret) return ret; - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - - if (get_user(flags, (int __user *)arg)) { - ret = -EFAULT; - goto out; - } 
- flags = f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1328,11 +1486,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); - f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); return ret; @@ -1353,17 +1510,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(F2FS_I_SB(inode)); - - if (f2fs_is_atomic_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (!get_dirty_pages(inode)) + goto out; + + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + clear_inode_flag(inode, FI_ATOMIC_FILE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1374,22 +1549,27 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - ret = mnt_want_write_file(filp); 
if (ret) return ret; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto err_out; + if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - ret = commit_inmem_pages(inode, false); - if (ret) + clear_inode_flag(inode, FI_ATOMIC_FILE); + ret = commit_inmem_pages(inode); + if (ret) { + set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; + } } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1402,31 +1582,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1441,13 +1644,19 @@ static int 
f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); + inode_lock(inode); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages(inode); + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(inode, FI_VOLATILE_FILE); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + } + + inode_unlock(inode); mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1457,6 +1666,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1464,30 +1674,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; + f2fs_update_time(sbi, REQ_TIME); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1508,15 +1726,21 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; + ret = mnt_want_write_file(filp); + 
if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1532,38 +1756,31 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, - sizeof(policy))) + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, + sizeof(policy))) return -EFAULT; - return f2fs_process_policy(&policy, inode); -#else - return -EOPNOTSUPP; -#endif + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int err; - err = f2fs_get_policy(inode, &policy); + err = fscrypt_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, - sizeof(policy))) + if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; -#else - return -EOPNOTSUPP; -#endif } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1586,13 +1803,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + 
mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1605,6 +1822,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1615,21 +1833,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1637,13 +1864,343 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + ret = mnt_want_write_file(filp); + if (ret) + return ret; - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + ret = f2fs_sync_fs(sbi->sb, 1); - return 0; + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = 
false; + int err; + + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(inode, 
FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(inode, FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_DO_DEFRAG); +out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; +} + +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if 
(file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + + inode_lock(src); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + 
return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1679,6 +2236,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); default: return -ENOTTY; } @@ -1686,14 +2247,36 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct blk_plug plug; + ssize_t ret; if (f2fs_encrypted_inode(inode) && - !f2fs_has_encryption_key(inode) && - f2fs_get_encryption_info(inode)) + !fscrypt_has_encryption_key(inode) && + fscrypt_get_encryption_info(inode)) return -EACCES; - return generic_file_write_iter(iocb, from); + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) { + ret = f2fs_preallocate_blocks(iocb, from); + if (!ret) { + 
blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + } + } + inode_unlock(inode); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; } #ifdef CONFIG_COMPAT @@ -1706,6 +2289,24 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; + case F2FS_IOC_MOVE_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0842..0a0a1ad1fe1f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "node.h" @@ -48,6 +47,11 @@ static int gc_thread_func(void *data) continue; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + /* * [GC triggering condition] * 0. GC is not conducted currently. 
@@ -97,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -173,9 +177,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -246,6 +250,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, return get_cb_cost(sbi, segno); } +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. 
@@ -259,9 +275,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno, max_cost; + unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); - int nsearched = 0; + unsigned int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -269,11 +285,12 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.min_cost = get_max_cost(sbi, &p); if (p.max_search == 0) goto out; + last_victim = sbi->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -296,27 +313,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } p.offset = segno + p.ofs_unit; - if (p.ofs_unit > 1) + if (p.ofs_unit > 1) { p.offset -= segno % p.ofs_unit; + nsearched += count_bits(p.dirty_segmap, + p.offset - p.ofs_unit, + p.ofs_unit); + } else { + nsearched++; + } + secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) - continue; + goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) - continue; + goto next; cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } else if (unlikely(cost == max_cost)) { - continue; } - - if (nsearched++ >= p.max_search) { - sbi->last_victim[p.gc_mode] = segno; +next: + if (nsearched >= p.max_search) { + if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) + sbi->last_victim[p.gc_mode] = last_victim + 1; + else + sbi->last_victim[p.gc_mode] = segno + 1; break; } } @@ -400,13 +425,13 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. 
*/ -static int gc_node_segment(struct f2fs_sb_info *sbi, +static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; block_t start_addr; int off; + int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -419,16 +444,24 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, struct node_info ni; /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { + if (phase == 0) { + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { ra_node_page(sbi, nid); continue; } + + /* phase == 2 */ node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; @@ -445,36 +478,12 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } - if (initial) { - initial = false; + if (++phase < 3) goto next_step; - } - - if (gc_type == FG_GC) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } /* @@ -484,7 +493,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, * as indirect or double indirect node blocks, are given, it must be a caller's * bug. 
*/ -block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -501,7 +510,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode); } static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -547,6 +556,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) struct f2fs_summary sum; struct node_info ni; struct page *page; + block_t newaddr; int err; /* do not read out */ @@ -568,21 +578,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ fio.page = page; - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), - fio.blk_addr, - FGP_LOCK|FGP_CREAT, - GFP_NOFS); - if (!fio.encrypted_page) - goto put_out; + allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, CURSEG_COLD_DATA); + + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto recover_block; + } err = f2fs_submit_page_bio(&fio); if (err) @@ -591,33 +604,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* write page */ lock_page(fio.encrypted_page); - if (unlikely(!PageUptodate(fio.encrypted_page))) + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + err = -EIO; goto put_page_out; - if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + } + if (unlikely(!PageUptodate(fio.encrypted_page))) { + err = -EIO; goto put_page_out; + } set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, - &fio.blk_addr, &sum, CURSEG_COLD_DATA); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + fio.rw = WRITE_SYNC; + fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); - dn.data_blkaddr = fio.blk_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + 
f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); +recover_block: + if (err) + __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true); put_out: f2fs_put_dnode(&dn); out: @@ -645,12 +664,23 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + bool is_dirty = PageDirty(page); + int err; + +retry: set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); + set_cold_data(page); - do_write_data_page(&fio); + + err = do_write_data_page(&fio); + if (err == -ENOMEM && is_dirty) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + clear_cold_data(page); } out: @@ -664,7 +694,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -684,16 +714,23 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); /* stop BG_GC if there is not enough free sections. 
*/ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + ra_node_page(sbi, nid); continue; } @@ -701,14 +738,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { + if (phase == 2) { ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode)) continue; @@ -720,7 +757,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { @@ -733,30 +770,41 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } - /* phase 3 */ + /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->dio_rwsem[READ])) + continue; + if (!down_write_trylock( + &fi->dio_rwsem[WRITE])) { + up_write(&fi->dio_rwsem[READ]); + continue; + } + locked = true; + } + + start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) move_encrypted_block(inode, start_bidx); else move_data_page(inode, start_bidx, gc_type); + + if (locked) { + up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->dio_rwsem[READ]); + } + 
stat_inc_data_blk_count(sbi, 1, gc_type); } } - if (++phase < 4) + if (++phase < 5) goto next_step; - - if (gc_type == FG_GC) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -772,51 +820,84 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; - int nfree = 0; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + sbi->segs_per_sec; + int sec_freed = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + SUM_TYPE_DATA : SUM_TYPE_NODE; - /* read segment summary of victim */ - sum_page = get_sum_page(sbi, segno); + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + sbi->segs_per_sec, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + sum_page = get_sum_page(sbi, segno++); + unlock_page(sum_page); + } blk_start_plug(&plug); - sum = page_address(sum_page); + for (segno = start_segno; segno < end_segno; segno++) { - /* - * this is to avoid deadlock: - * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() - * - lock_page(sum_page) - */ - unlock_page(sum_page); + if (get_valid_blocks(sbi, segno, 1) == 0 || + unlikely(f2fs_cp_error(sbi))) + goto next; - switch (GET_SUM_TYPE((&sum->footer))) { - case SUM_TYPE_NODE: - nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); - break; - case SUM_TYPE_DATA: - 
nfree = gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); - break; + /* find segment summary of victim */ + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_bug_on(sbi, !PageUptodate(sum_page)); + f2fs_put_page(sum_page, 0); + + sum = page_address(sum_page); + f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - mutex_lock(sentry_lock) + * - mutex_lock(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + + if (type == SUM_TYPE_NODE) + gc_node_segment(sbi, sum->entries, segno, gc_type); + else + gc_data_segment(sbi, sum->entries, gc_list, segno, + gc_type); + + stat_inc_seg_count(sbi, type, gc_type); +next: + f2fs_put_page(sum_page, 0); } + + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); + if (gc_type == FG_GC && + get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + sec_freed = 1; + stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); - return nfree; + return sec_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { - unsigned int segno, i; + unsigned int segno; int gc_type = sync ? 
FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; @@ -832,46 +913,48 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { gc_type = FG_GC; - if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) - write_checkpoint(sbi, &cpc); + /* + * If there is no victim and no prefree segment but still not + * enough free sections, we should flush dent/node blocks and do + * garbage collections. + */ + if (__get_victim(sbi, &segno, gc_type) || + prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + segno = NULL_SEGNO; + } else if (has_not_enough_free_secs(sbi, 0, 0)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; - /* readahead multi ssa blocks those have contiguous address */ - if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA, true); - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * for FG_GC case, halt gcing left segments once failed one - * of segments in selected section to avoid long latency. 
- */ - if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && - gc_type == FG_GC) - break; - } - - if (i == sbi->segs_per_sec && gc_type == FG_GC) + if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && + gc_type == FG_GC) sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); } stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..a04c1016d511 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -16,9 +16,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -54,7 +51,7 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); @@ -62,7 +59,8 @@ void read_inline_data(struct page *page, struct page *ipage) memcpy(dst_addr, src_addr, MAX_INLINE_DATA); flush_dcache_page(page); kunmap_atomic(dst_addr); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } bool truncate_inline_inode(struct page *ipage, u64 from) 
@@ -74,9 +72,9 @@ bool truncate_inline_inode(struct page *ipage, u64 from) addr = inline_data_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); - + set_page_dirty(ipage); return true; } @@ -96,11 +94,12 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } if (page->index) - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); else read_inline_data(page, ipage); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); f2fs_put_page(ipage, 1); unlock_page(page); return 0; @@ -108,7 +107,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { - void *src_addr, *dst_addr; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, @@ -118,8 +116,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) }; int dirty, err; - f2fs_bug_on(F2FS_I_SB(dn->inode), page->index); - if (!f2fs_exist_data(dn->inode)) goto clear_out; @@ -127,21 +123,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - f2fs_wait_on_page_writeback(page, DATA); + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - if (PageUptodate(page)) - goto no_update; - - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); - - /* Copy the whole inline data block */ - src_addr = inline_data_addr(dn->inode_page); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - flush_dcache_page(page); - kunmap_atomic(dst_addr); - SetPageUptodate(page); -no_update: + read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -149,23 +133,21 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); - fio.blk_addr = 
dn->data_blkaddr; + fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) inode_dec_dirty_pages(dn->inode); /* this converted inline_data should be recovered. */ - set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ truncate_inline_inode(dn->inode_page, 0); + clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); - sync_inode_page(dn); f2fs_put_dnode(dn); return 0; } @@ -177,7 +159,10 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - page = grab_cache_page(inode->i_mapping, 0); + if (!f2fs_has_inline_data(inode)) + return 0; + + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -199,6 +184,9 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } @@ -220,16 +208,17 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); - f2fs_wait_on_page_writeback(dn.inode_page, NODE); + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); dst_addr = inline_data_addr(dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); - sync_inode_page(&dn); + clear_inline_node(dn.inode_page); f2fs_put_dnode(&dn); return 0; } @@ -258,16 +247,16 @@ bool recover_inline_data(struct inode *inode, struct page *npage) ipage = get_node_page(sbi, inode->i_ino); 
f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); src_addr = inline_data_addr(npage); dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); - update_inode(inode, ipage); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return true; } @@ -278,7 +267,6 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (!truncate_inline_inode(ipage, 0)) return false; f2fs_clear_inline_inode(inode); - update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -289,7 +277,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage) } struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, - struct f2fs_filename *fname, struct page **res_page) + struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_inline_dentry *inline_dentry; @@ -300,8 +288,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } namehash = f2fs_dentry_hash(&name); @@ -315,30 +305,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. 
- */ - f2fs_bug_on(sbi, d.max < 0); - return de; -} - -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, - struct page **p) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; - struct f2fs_dir_entry *de; - struct f2fs_inline_dentry *dentry_blk; - - ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return NULL; - - dentry_blk = inline_data_addr(ipage); - de = &dentry_blk->dentry[1]; - *p = ipage; - unlock_page(ipage); return de; } @@ -356,10 +322,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) { - i_size_write(inode, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + if (i_size_read(inode) < MAX_INLINE_DATA) + f2fs_i_size_write(inode, MAX_INLINE_DATA); return 0; } @@ -367,7 +331,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. 
*/ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -375,7 +339,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -386,8 +350,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, if (err) goto out; - f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); dentry_blk = kmap_atomic(page); @@ -408,37 +372,132 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, NR_INLINE_DENTRY * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); set_page_dirty(page); /* clear inline dir and flag after data writeback */ truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + clear_inode_flag(dir, FI_INLINE_DENTRY); - if (i_size_read(dir) < PAGE_CACHE_SIZE) { - i_size_write(dir, PAGE_CACHE_SIZE); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - - sync_inode_page(&dn); + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); out: f2fs_put_page(page, 1); return err; } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode, nid_t ino, umode_t mode) +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while 
(bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + +int 
f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); + int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; int err = 0; @@ -459,25 +518,27 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); + page = init_inode_metadata(inode, dir, new_name, + orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(name); + name_hash = f2fs_dentry_hash(new_name); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); - f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); /* we don't need to mark_inode_dirty now */ if (inode) { - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -485,11 +546,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode(dir, ipage); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } out: f2fs_put_page(ipage, 1); return err; @@ -504,22 +560,22 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); inline_dentry = 
inline_data_addr(page); bit_pos = dentry - inline_dentry->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, + __clear_bit_le(bit_pos + i, &inline_dentry->dentry_bitmap); set_page_dirty(page); + f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, page); - - f2fs_put_page(page, 1); + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_inline_dir(struct inode *dir) @@ -547,7 +603,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) } int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, - struct f2fs_str *fstr) + struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); struct f2fs_inline_dentry *inline_dentry = NULL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 97e20decacb4..d7369895a78a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "f2fs.h" @@ -18,6 +19,13 @@ #include +void f2fs_mark_inode_dirty_sync(struct inode *inode) +{ + if (f2fs_inode_dirtied(inode)) + return; + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -35,6 +43,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -83,10 +92,10 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); set_page_dirty(ipage); return; } @@ -138,9 +147,10 @@ static int 
do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); - get_inline_info(fi, ri); + get_inline_info(inode, ri); /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) @@ -150,7 +160,10 @@ static int do_read_inode(struct inode *inode) __get_inode_rdev(inode, ri); if (__written_first_block(ri)) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; f2fs_put_page(node_page, 1); @@ -202,6 +215,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -221,11 +235,27 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE); + f2fs_inode_synced(inode); + + f2fs_wait_on_page_writeback(node_page, NODE, true); ri = F2FS_INODE(node_page); @@ -242,7 +272,7 @@ void update_inode(struct inode *inode, struct page *node_page) &ri->i_ext); else memset(&ri->i_ext, 0, sizeof(ri->i_ext)); - set_raw_inline(F2FS_I(inode), ri); + 
set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -259,15 +289,19 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + /* deleted inode */ + if (inode->i_nlink == 0) + clear_inline_node(node_page); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -276,12 +310,14 @@ void update_inode_page(struct inode *inode) cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } - return; + f2fs_inode_synced(inode); + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -292,16 +328,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; /* * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -311,13 +346,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - nid_t xnid = fi->i_xattr_nid; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -327,19 +361,24 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - sb_start_intwrite(inode->i_sb); - set_inode_flag(fi, FI_NO_ALLOC); - i_size_write(inode, 0); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (!err) { f2fs_lock_op(sbi); @@ -347,6 +386,14 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) + update_inode_page(inode); sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -356,36 +403,18 @@ void f2fs_evict_inode(struct inode *inode) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if 
(is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); - if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); - clear_inode_flag(fi, FI_FREE_NID); - } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(inode, FI_FREE_NID)) { + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (fi->i_crypt_info) - f2fs_free_encryption_info(inode, fi->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); } @@ -393,37 +422,32 @@ void f2fs_evict_inode(struct inode *inode) void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. valid block count, node block or nid). 
Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. - * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) - add_orphan_inode(sbi, inode->i_ino); + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { + add_orphan_inode(inode); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2c32110f9fc0..0f071a70522d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(inode, FI_NEW_INODE); + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + set_inode_flag(inode, FI_INLINE_DENTRY); f2fs_init_extent_tree(inode, NULL); @@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_dir(inode); trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); return inode; fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); if (nid_free) - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + 
set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); } @@ -88,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); + int i; /* * filename format of multimedia file should be defined as: - * "filename + '.' + extension". + * "filename + '.' + extension + (optional: '.' + temp extension)". */ if (slen < sublen + 2) return 0; - if (s[slen - sublen - 1] != '.') - return 0; + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } - return !strncasecmp(s + slen - sublen, sub, sublen); + return 0; } /* @@ -128,8 +136,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +148,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -169,15 +177,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, int err; if (f2fs_encrypted_inode(dir) && - !f2fs_is_child_context_consistent_with_parent(dir, inode)) + !fscrypt_has_permitted_context(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -190,7 +198,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); f2fs_unlock_op(sbi); return err; @@ -199,10 +207,14 @@ static int f2fs_link(struct dentry *old_dentry, 
struct inode *dir, struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); - if (!ino) + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } static int __recover_dot_dentries(struct inode *dir, nid_t pino) @@ -214,12 +226,24 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) @@ -230,14 +254,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); } out: - if (!err) { - clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); - mark_inode_dirty(dir); - } + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); f2fs_unlock_op(sbi); return err; @@ -251,13 +275,32 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct page *page; nid_t ino; int err = 0; + unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + + if (f2fs_encrypted_inode(dir)) { + int res = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was 
encrypted and we + * don't have access to the key. + */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + return (struct dentry *)page; return d_splice_alias(inode, dentry); + } ino = le32_to_cpu(de->ino); f2fs_dentry_kunmap(dir, page); @@ -267,15 +310,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return ERR_CAST(inode); + if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { + err = __recover_dot_dentries(dir, root_ino); + if (err) + goto err_out; + } + if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) goto err_out; } + if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + bool nokey = f2fs_encrypted_inode(inode) && + !fscrypt_has_encryption_key(inode); + err = nokey ? 
-ENOKEY : -EPERM; + goto err_out; + } return d_splice_alias(inode, dentry); err_out: - iget_failed(inode); + iput(inode); return ERR_PTR(err); } @@ -288,11 +345,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); goto fail; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -305,9 +366,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -332,16 +390,24 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - size_t p_len; - char *p_str; - struct f2fs_str disk_link = FSTR_INIT(NULL, 0); - struct f2fs_encrypted_symlink_data *sd = NULL; + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; int err; - if (len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (f2fs_encrypted_inode(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + return err; - f2fs_balance_fs(sbi); + if (!fscrypt_has_encryption_key(dir)) + return -EPERM; + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } + + if (disk_link.len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) @@ -351,8 +417,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = 
&f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -360,42 +429,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(inode)) { struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; - err = f2fs_get_encryption_info(inode); - if (err) - goto err_out; - - err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); - if (err) - goto err_out; - - err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); - if (err < 0) - goto err_out; - - p_len = encrypted_symlink_data_len(disk_link.len) + 1; - - if (p_len > dir->i_sb->s_blocksize) { - err = -ENAMETOOLONG; - goto err_out; - } - - sd = kzalloc(p_len, GFP_NOFS); + sd = kzalloc(disk_link.len, GFP_NOFS); if (!sd) { err = -ENOMEM; goto err_out; } - memcpy(sd->encrypted_path, disk_link.name, disk_link.len); - sd->len = cpu_to_le16(disk_link.len); - p_str = (char *)sd; - } else { - p_len = len + 1; - p_str = (char *)symname; + + err = fscrypt_get_encryption_info(inode); + if (err) + goto err_out; + + if (!fscrypt_has_encryption_key(inode)) { + err = -EPERM; + goto err_out; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err < 0) + goto err_out; + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; } - err = page_symlink(inode, p_str, p_len); + err = page_symlink(inode, disk_link.name, disk_link.len); err_out: d_instantiate(dentry, inode); @@ -411,7 +474,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, * performance regression. 
*/ if (!err) { - filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -420,7 +484,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, } kfree(sd); - f2fs_fname_crypto_free_buffer(&disk_link); return err; out: handle_failed_inode(inode); @@ -433,8 +496,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -444,7 +505,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_balance_fs(sbi, true); + + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -461,7 +524,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); handle_failed_inode(inode); return err; } @@ -481,8 +544,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -490,6 +551,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -516,9 +579,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ 
-532,6 +592,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -545,17 +607,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - + add_orphan_inode(inode); alloc_nid_done(sbi, inode->i_ino); if (whiteout) { - inode_dec_link_count(inode); + f2fs_i_links_write(inode, false); *whiteout = inode; } else { d_tmpfile(dentry, inode); } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); unlock_new_inode(inode); return 0; @@ -569,7 +631,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { if (f2fs_encrypted_inode(dir)) { - int err = f2fs_get_encryption_info(dir); + int err = fscrypt_get_encryption_info(dir); if (err) return err; } @@ -595,26 +657,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode)) { + !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if 
(IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_old; + } } if (flags & RENAME_WHITEOUT) { @@ -632,8 +697,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_whiteout; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -641,8 +711,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - if (update_dent_inode(old_inode, new_inode, - &new_dentry->d_name)) { + err = update_dent_inode(old_inode, new_inode, + &new_dentry->d_name); + if (err) { release_orphan_inode(sbi); goto put_out_dir; } @@ -652,20 +723,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); up_write(&F2FS_I(new_inode)->i_sem); - mark_inode_dirty(new_inode); - if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); + add_orphan_inode(new_inode); else release_orphan_inode(sbi); - - update_inode_page(old_inode); - update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -674,9 +742,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_whiteout; } - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); + + /* + * old entry and new entry can locate in the same inline + * dentry in inode, when attaching new entry in inline dentry, + * it could force inline dentry conversion, after that, + * old_entry and old_page will point to wrong address, in + * order to avoid this, let's do the check 
and update here. + */ + if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { + f2fs_put_page(old_page, 0); + old_page = NULL; + + old_entry = f2fs_find_entry(old_dir, + &old_dentry->d_name, &old_page); + if (!old_entry) { + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + f2fs_unlock_op(sbi); + goto out_whiteout; + } } } @@ -687,13 +775,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (whiteout) { whiteout->i_state |= I_LINKABLE; - set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; @@ -705,14 +793,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - update_inode_page(old_inode); } else { f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } - drop_nlink(old_dir); - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_i_links_write(old_dir, false); } f2fs_unlock_op(sbi); @@ -756,39 +841,45 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int err = -ENOENT; if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode) || - !f2fs_is_child_context_consistent_with_parent(old_dir, - new_inode))) + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, old_inode) || + !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = 
PTR_ERR(old_page); goto out; + } new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_old; + } /* prepare for updating ".." directory entry info later */ if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_new; + } } if (S_ISDIR(new_inode->i_mode)) { - err = -EIO; new_dir_entry = f2fs_parent_dir(new_inode, &new_dir_page); - if (!new_dir_entry) + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); goto out_old_dir; + } } } @@ -807,6 +898,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -836,19 +929,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - update_inode_page(old_inode); - old_dir->i_ctime = CURRENT_TIME; if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); - if (old_nlink < 0) - drop_nlink(old_dir); - else - inc_nlink(old_dir); + f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_mark_inode_dirty_sync(old_dir); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -857,19 +944,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - update_inode_page(new_inode); - new_dir->i_ctime = CURRENT_TIME; if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); - if (new_nlink < 0) - drop_nlink(new_dir); - else - 
inc_nlink(new_dir); + f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - mark_inode_dirty(new_dir); - update_inode_page(new_dir); + f2fs_mark_inode_dirty_sync(new_dir); f2fs_unlock_op(sbi); @@ -922,89 +1003,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -#ifdef CONFIG_F2FS_FS_ENCRYPTION static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; - struct f2fs_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_str cstr = FSTR_INIT(NULL, 0); + struct fscrypt_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_symlink_data *sd; struct inode *inode = d_inode(dentry); - struct f2fs_encrypted_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; - res = f2fs_get_encryption_info(inode); + if (!dentry) + return ERR_PTR(-ECHILD); + + res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); - caddr[size] = 0; + caddr = page_address(cpage); /* Symlink is encrypted */ - sd = (struct f2fs_encrypted_symlink_data *)caddr; + sd = (struct fscrypt_symlink_data *)caddr; + cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); - cstr.name = kmalloc(cstr.len, GFP_NOFS); - if (!cstr.name) { - res = -ENOMEM; - goto errout; - } - memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.len == 0)) { res = -ENOENT; goto errout; } - if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) > - max_size) { + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EIO; goto errout; } 
- res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); + res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; - res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - kfree(cstr.name); + /* this is broken symlink case */ + if (unlikely(pstr.name[0] == 0)) { + res = -ENOENT; + goto errout; + } paddr = pstr.name; /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); - page_cache_release(cpage); + put_page(cpage); return *cookie = paddr; errout: - kfree(cstr.name); - f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); - page_cache_release(cpage); + fscrypt_fname_free_buffer(&pstr); + put_page(cpage); return ERR_PTR(res); } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, - .put_link = kfree_put_link, + .follow_link = f2fs_encrypted_follow_link, + .put_link = kfree_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, -}; #endif +}; const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c40d..b1e615ed2bef 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,12 +46,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; } else if (type == DIRTY_DENTS) { if 
(sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -62,16 +64,17 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) for (i = 0; i <= UPDATE_INO; i++) mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + sizeof(struct ino_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * - sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -120,7 +123,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -256,18 +259,21 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) return new; } -static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, +static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); + } else { + f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || + nat_get_blkaddr(e) != ne->block_addr || + nat_get_version(e) != ne->version); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -355,7 +361,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct 
node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = START_NID(nid); struct f2fs_nat_block *nat_blk; struct page *page = NULL; @@ -372,21 +378,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - } - up_read(&nm_i->nat_tree_lock); - if (e) + up_read(&nm_i->nat_tree_lock); return; + } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); /* Check current segment summary */ - mutex_lock(&curseg->curseg_mutex); - i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); + down_read(&curseg->journal_rwsem); + i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { - ne = nat_in_journal(sum, i); + ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); if (i >= 0) goto cache; @@ -397,18 +402,75 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - cache_nat_entry(NM_I(sbi), nid, &ne); + down_write(&nm_i->nat_tree_lock); + cache_nat_entry(sbi, nid, &ne); + up_write(&nm_i->nat_tree_lock); +} + +/* + * readahead MAX_RA_NODE number of node pages. 
+ */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK; + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + case 2: + base += 2 * direct_blks; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; } /* * The maximum depth is four. * Offset[0] will have raw inode offset. 
*/ -static int get_node_path(struct f2fs_inode_info *fi, long block, +static int get_node_path(struct inode *inode, long block, int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE(fi); + const long direct_index = ADDRS_PER_INODE(inode); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -493,10 +555,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int offset[4]; unsigned int noffset[4]; nid_t nids[4]; - int level, i; + int level, i = 0; int err = 0; - level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); + level = get_node_path(dn->inode, index, offset, noffset); nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -583,6 +645,11 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) release_out: dn->inode_page = NULL; dn->node_page = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } return err; } @@ -606,8 +673,7 @@ static void truncate_node(struct dnode_of_data *dn) if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } invalidate: clear_node_page_dirty(dn->node_page); @@ -666,6 +732,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -676,7 +744,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +758,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int 
nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -741,6 +811,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -750,7 +822,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -787,8 +860,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(F2FS_I(inode), from, offset, noffset); -restart: + level = get_node_path(inode, from, offset, noffset); + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -852,11 +925,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } - f2fs_wait_on_page_writeback(page, NODE); + BUG_ON(page->mapping != NODE_MAPPING(sbi)); + f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -885,7 +955,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) if (IS_ERR(npage)) return PTR_ERR(npage); - F2FS_I(inode)->i_xattr_nid = 0; + f2fs_i_xnid_write(inode, 0); /* need to do checkpoint during fsync 
*/ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -951,10 +1021,10 @@ struct page *new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -971,23 +1041,19 @@ struct page *new_node_page(struct dnode_of_data *dn, new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); - SetPageUptodate(page); - set_page_dirty(page); + if (!PageUptodate(page)) + SetPageUptodate(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) - F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); if (ofs == 0) inc_valid_inode_count(sbi); - return page; fail: @@ -1013,6 +1079,9 @@ static int read_node_page(struct page *page, int rw) .encrypted_page = NULL, }; + if (PageUptodate(page)) + return LOCKED_PAGE; + get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1020,10 +1089,7 @@ static int read_node_page(struct page *page, int rw) return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; - - fio.blk_addr = ni.blk_addr; + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; return f2fs_submit_page_bio(&fio); } @@ -1035,14 +1101,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; - apage = find_get_page(NODE_MAPPING(sbi), nid); - if (apage && PageUptodate(apage)) { - f2fs_put_page(apage, 0); + if (!nid) return; - } - 
f2fs_put_page(apage, 0); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); - apage = grab_cache_page(NODE_MAPPING(sbi), nid); + rcu_read_lock(); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + rcu_read_unlock(); + if (apage) + return; + + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; @@ -1050,53 +1119,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { struct page *page; int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } - - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; - } - return page; -} - -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - struct page *page; - int err, i, end; - nid_t nid; - - /* First, try getting the desired direct node. 
*/ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1108,61 +1141,116 @@ struct page *get_node_page_ra(struct page *parent, int start) goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } + + if (unlikely(!PageUptodate(page))) + goto out_err; page_hit: - if (unlikely(!PageUptodate(page))) { + if(unlikely(nid != nid_of_node(page))) { + f2fs_bug_on(sbi, 1); + ClearPageUptodate(page); +out_err: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } return page; } -void sync_inode_page(struct dnode_of_data *dn) +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); - } else { - update_inode_page(dn->inode); - } + return __get_node_page(sbi, nid, NULL, 0); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + +static void flush_inline_data(struct 
f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct page *page; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + if (!page) + goto iput_out; + + if (!PageUptodate(page)) + goto page_out; + + if (!PageDirty(page)) + goto page_out; + + if (!clear_page_dirty_for_io(page)) + goto page_out; + + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + if (ret) + set_page_dirty(page); +page_out: + f2fs_put_page(page, 1); +iput_out: + iput(inode); +} + +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 
2 : 0; - int nwritten = 0, wrote = 0; + struct page *last_page = NULL; pagevec_init(&pvec, 0); - -next_step: index = 0; - end = LONG_MAX; + end = ULONG_MAX; while (index <= end) { int i, nr_pages; @@ -1175,6 +1263,190 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic) +{ + pgoff_t index, end; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nwritten = 0; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if 
(ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + update_inode(inode, page); + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } else { + nwritten++; + } + + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } + + if (nwritten) + f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + return ret ? 
-EIO: 0; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = 0; + int nwritten = 0; + int ret = 0; + + pagevec_init(&pvec, 0); + +next_step: + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + ret = -EIO; + goto out; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1189,14 +1461,8 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) +lock_node: + if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1204,33 +1470,33 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } + /* flush inline_data */ + if (is_inline_node(page)) { + clear_inline_node(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + goto lock_node; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if 
(NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); else - wrote++; + nwritten++; if (--wbc->nr_to_write == 0) break; @@ -1248,15 +1514,15 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, step++; goto next_step; } - - if (wrote) +out: + if (nwritten) f2fs_submit_merged_bio(sbi, NODE, WRITE); - return nwritten; + return ret; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; int ret2 = 0, ret = 0; @@ -1278,7 +1544,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) ret = -EIO; } @@ -1317,8 +1583,6 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, NODE); - /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); @@ -1342,14 +1606,18 @@ static int f2fs_write_node_page(struct page *page, } set_page_writeback(page); - fio.blk_addr = ni.blk_addr; + fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1363,10 +1631,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff; - trace_f2fs_writepages(mapping->host, wbc, NODE); - /* balancing f2fs's metadata in 
background */ f2fs_balance_fs_bg(sbi); @@ -1374,14 +1641,19 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + blk_start_plug(&plug); + sync_node_pages(sbi, wbc); + blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); return 0; } @@ -1389,9 +1661,10 @@ static int f2fs_set_node_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, NODE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1409,6 +1682,9 @@ const struct address_space_operations f2fs_node_aops = { .set_page_dirty = f2fs_set_node_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1429,7 +1705,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1440,14 +1715,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - 
allocated = true; - up_read(&nm_i->nat_tree_lock); - if (allocated) return 0; } @@ -1516,22 +1786,24 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1550,16 +1822,19 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { - block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); - nid = le32_to_cpu(nid_in_journal(sum, i)); + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1575,6 +1850,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > 
nm_i->available_nids)) return false; @@ -1582,8 +1861,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1871,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1663,12 +1933,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; + if (nm_i->fcnt <= MAX_FREE_NIDS) + return 0; + if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) break; if (i->state == NID_ALLOC) continue; @@ -1695,7 +1968,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (!(ri->i_inline & F2FS_INLINE_XATTR)) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -1703,7 +1976,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(page); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: update_inode(inode, ipage); @@ -1737,13 +2010,11 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) get_node_info(sbi, new_xnid, &ni); ni.ino = 
inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - F2FS_I(inode)->i_xattr_nid = new_xnid; + f2fs_i_xnid_write(inode, new_xnid); /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); set_node_addr(sbi, &ni, blkaddr, false); - - update_inode_page(inode); } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1757,15 +2028,18 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); - if (!ipage) - return -ENOMEM; +retry: + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - SetPageUptodate(ipage); + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); src = F2FS_INODE(page); @@ -1831,28 +2105,26 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; - nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); - raw_ne = nat_in_journal(sum, i); + raw_ne = nat_in_journal(journal, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } - update_nats_in_cursum(sum, -i); - mutex_unlock(&curseg->curseg_mutex); + update_nats_in_cursum(journal, 
-i); + up_write(&curseg->journal_rwsem); } static void __adjust_nat_entry_set(struct nat_entry_set *nes, @@ -1877,24 +2149,23 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { - mutex_lock(&curseg->curseg_mutex); + down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); @@ -1911,35 +2182,29 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, continue; if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); - raw_ne = &nat_in_journal(sum, offset); - nid_in_journal(sum, offset) = cpu_to_le32(nid); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); } else { raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } if (to_journal) - mutex_unlock(&curseg->curseg_mutex); + up_write(&curseg->journal_rwsem); else f2fs_put_page(page, 1); f2fs_bug_on(sbi, set->entry_cnt); - 
down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1950,7 +2215,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct nat_entry_set *setvec[SETVEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; @@ -1959,29 +2224,32 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them * into nat entry set. */ - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, - MAX_NAT_JENTRIES(sum)); + MAX_NAT_JENTRIES(journal)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2006,6 +2274,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d98c4..868bec65e51c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,15 +15,21 
@@ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ -#define FREE_NID_PAGES 4 +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ -#define DEF_RAM_THRESHOLD 10 +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -117,6 +123,17 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, raw_ne->version = ni->version; } +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ @@ -183,7 +200,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -212,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) f2fs_change_bit(block_off, nm_i->nat_bitmap); } +static inline nid_t ino_of_node(struct page 
*node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { @@ -242,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); - rn->footer.cp_ver = ckpt->checkpoint_ver; + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline nid_t ino_of_node(struct page *node_page) +static inline bool is_recoverable_dnode(struct page *page) { - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.ino); -} + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = cur_cp_version(ckpt); -static inline nid_t nid_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return 
le32_to_cpu(rn->footer.nid); -} - -static inline unsigned int ofs_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; -} - -static inline unsigned long long cpver_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le64_to_cpu(rn->footer.cp_ver); -} - -static inline block_t next_blkaddr_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.next_blkaddr); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + return cpu_to_le64(cp_ver) == cpver_of_node(page); } /* @@ -317,17 +355,17 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - f2fs_wait_on_page_writeback(p, NODE); + f2fs_wait_on_page_writeback(p, NODE, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) @@ -370,6 +408,21 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +static inline int is_inline_node(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_inline_node(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_inline_node(struct page *page) +{ + ClearPageChecked(page); +} + static inline void set_cold_node(struct inode *inode, struct page *page) { struct f2fs_node *rn = F2FS_NODE(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c 
index cbf74f47cce8..2fc84a991325 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } @@ -67,42 +68,71 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct inode *inode, struct page *ipage) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino) +{ + struct inode *inode; + struct fsync_inode_entry *entry; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; + struct fscrypt_name fname; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; + char *name; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); - return 0; - } + dir = entry->inode; - 
name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; - if (unlikely(name.len > F2FS_NAME_LEN)) { + if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: - de = f2fs_find_entry(dir, &name, &page); + de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); @@ -118,29 +148,27 @@ static int recover_dentry(struct inode *inode, struct page *ipage) f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; - } - err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + err = __f2fs_do_add_link(dir, &fname, inode, + inode->i_ino, inode->i_mode); } - + if (err == -ENOMEM) + goto retry; goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: + if (file_enc_name(inode)) + name = ""; + else + name = raw_inode->i_name; f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), raw_inode->i_name, + __func__, ino_of_node(ipage), name, IS_ERR(dir) ? 
0 : dir->i_ino, err); return err; } @@ -151,7 +179,7 @@ static void recover_inode(struct inode *inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); - i_size_write(inode, le64_to_cpu(raw->i_size)); + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); @@ -168,9 +196,34 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; @@ -180,8 +233,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; @@ -190,49 +241,41 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(page)) break; if (!is_fsync_dnode(page)) goto next; entry = 
get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + entry = add_fsync_inode(sbi, head, ino_of_node(page)); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -248,11 +291,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -314,15 +354,14 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, if (ino != dn->inode->i_ino) { /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); } else { inode = dn->inode; } - bidx = start_bidx_of_node(offset, F2FS_I(inode)) + - le16_to_cpu(sum.ofs_in_node); + bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); /* * 
if inode page is locked, unlock temporarily, but its reference @@ -357,10 +396,9 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { - struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int start, end; struct dnode_of_data dn; struct node_info ni; + unsigned int start, end; int err = 0, recovered = 0; /* step 1: recover xattr */ @@ -380,16 +418,21 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; /* step 3: recover data indices */ - start = start_bidx_of_node(ofs_of_node(page), fi); - end = start + ADDRS_PER_PAGE(page, fi); + start = start_bidx_of_node(ofs_of_node(page), inode); + end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); - +retry_dn: err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } goto out; + } - f2fs_wait_on_page_writeback(dn.node_page, NODE); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); @@ -411,14 +454,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } + if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. 
*/ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } @@ -427,25 +472,33 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); + if (err) + goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } goto err; + } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, - ni.version, false); + ni.version, false, false); recovered++; } } - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - copy_node_footer(dn.node_page, page); fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); @@ -459,17 +512,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; int err = 0; block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -482,12 +534,12 @@ static int recover_data(struct f2fs_sb_info *sbi, page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) { + if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto 
next; /* @@ -495,10 +547,10 @@ static int recover_data(struct f2fs_sb_info *sbi, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (entry->last_inode == blkaddr) + if (IS_INODE(page)) recover_inode(entry->inode, page); if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, page); + err = recover_dentry(entry->inode, page, dir_list); if (err) { f2fs_put_page(page, 1); break; @@ -510,11 +562,8 @@ static int recover_data(struct f2fs_sb_info *sbi, break; } - if (entry->blkaddr == blkaddr) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + if (entry->blkaddr == blkaddr) + del_fsync_inode(entry); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -525,12 +574,14 @@ static int recover_data(struct f2fs_sb_info *sbi, return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + struct list_head dir_list; block_t blkaddr; int err; + int ret = 0; bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -539,6 +590,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); @@ -547,25 +599,26 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); - if (err) + if (err || list_empty(&inode_list)) goto out; - if (list_empty(&inode_list)) + if (check_only) { + ret = 1; goto out; + } need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); - 
kmem_cache_destroy(fsync_entry_slab); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), - (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); @@ -573,31 +626,20 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) { - bool invalidate = false; + if (err) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); - if (discard_next_dnode(sbi, blkaddr)) - invalidate = true; + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list); - /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) - sync_meta_pages(sbi, META, LONG_MAX); - - /* invalidate temporary meta page */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), - blkaddr, blkaddr); - - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - mutex_unlock(&sbi->cp_mutex); - } else if (need_writecp) { + if (!err && need_writecp) { struct cp_control cpc = { .reason = CP_RECOVERY, }; - mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, &cpc); - } else { - mutex_unlock(&sbi->cp_mutex); + err = write_checkpoint(sbi, &cpc); } - return err; + + kmem_cache_destroy(fsync_entry_slab); + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f77b3258454a..b3c61ae37f92 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. 
* Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; + while (1) { + if (*p == 0) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. 
*/ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); + while (1) { + if (*p == ~0UL) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. 
*/ -found_middle: - return result + __reverse_ffz(tmp); + return result; +found: + return result - size + __reverse_ffz(tmp); } void register_inmem_page(struct inode *inode, struct page *page) @@ -211,69 +191,149 @@ void register_inmem_page(struct inode *inode, struct page *page) trace_f2fs_register_inmem_page(page, INMEM); } -int commit_inmem_pages(struct inode *inode, bool abort) +static int __revoke_inmem_pages(struct inode *inode, + struct list_head *head, bool drop, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inmem_pages *cur, *tmp; + int err = 0; + + list_for_each_entry_safe(cur, tmp, head, list) { + struct page *page = cur->page; + + if (drop) + trace_f2fs_commit_inmem_page(page, INMEM_DROP); + + lock_page(page); + + if (recover) { + struct dnode_of_data dn; + struct node_info ni; + + trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = -EAGAIN; + goto next; + } + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + cur->old_addr, ni.version, true, true); + f2fs_put_dnode(&dn); + } +next: + /* we don't need to invalidate this in the sccessful status */ + if (drop || recover) + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 1); + + list_del(&cur->list); + kmem_cache_free(inmem_entry_slab, cur); + dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + } + return err; +} + +void drop_inmem_pages(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + mutex_unlock(&fi->inmem_lock); +} + +static int __commit_inmem_pages(struct inode *inode, + struct list_head *revoke_list) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *cur, *tmp; 
- bool submit_bio = false; struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; + bool submit_bio = false; int err = 0; - /* - * The abort is true only when f2fs_evict_inode is called. - * Basically, the f2fs_evict_inode doesn't produce any data writes, so - * that we don't need to call f2fs_balance_fs. - * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this - * inode becomes free by iget_locked in f2fs_iget. - */ - if (!abort) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + struct page *page = cur->page; + + lock_page(page); + if (page->mapping == inode->i_mapping) { + trace_f2fs_commit_inmem_page(page, INMEM); + + set_page_dirty(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (clear_page_dirty_for_io(page)) + inode_dec_dirty_pages(inode); + + fio.page = page; + err = do_write_data_page(&fio); + if (err) { + unlock_page(page); + break; + } + + /* record old blkaddr for revoking */ + cur->old_addr = fio.old_blkaddr; + + clear_cold_data(page); + submit_bio = true; + } + unlock_page(page); + list_move_tail(&cur->list, revoke_list); } - mutex_lock(&fi->inmem_lock); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - lock_page(cur->page); - if (!abort) { - if (cur->page->mapping == inode->i_mapping) { - set_page_dirty(cur->page); - f2fs_wait_on_page_writeback(cur->page, DATA); - if (clear_page_dirty_for_io(cur->page)) - inode_dec_dirty_pages(inode); - trace_f2fs_commit_inmem_page(cur->page, INMEM); - fio.page = cur->page; - err = do_write_data_page(&fio); - if (err) { - unlock_page(cur->page); - break; - } - clear_cold_data(cur->page); - submit_bio = true; - } - } else { - trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); - } - set_page_private(cur->page, 0); - ClearPagePrivate(cur->page); - f2fs_put_page(cur->page, 1); + if (submit_bio) + f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); - 
list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + if (!err) + __revoke_inmem_pages(inode, revoke_list, false, false); + + return err; +} + +int commit_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct list_head revoke_list; + int err; + + INIT_LIST_HEAD(&revoke_list); + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + mutex_lock(&fi->inmem_lock); + err = __commit_inmem_pages(inode, &revoke_list); + if (err) { + int ret; + /* + * try to revoke all committed pages, but still we could fail + * due to no memory or other reason, if that happened, EAGAIN + * will be returned, which means in such case, transaction is + * already not integrity, caller should use journal to do the + * recovery or rewrite & commit last transaction. For other + * error number, revoking was done by filesystem itself. + */ + ret = __revoke_inmem_pages(inode, &revoke_list, false, true); + if (ret) + err = ret; + + /* drop all uncommitted pages */ + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } mutex_unlock(&fi->inmem_lock); - if (!abort) { - f2fs_unlock_op(sbi); - if (submit_bio) - f2fs_submit_merged_bio(sbi, DATA, WRITE); - } + f2fs_unlock_op(sbi); return err; } @@ -281,13 +341,25 @@ int commit_inmem_pages(struct inode *inode, bool abort) * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + + if (!need) + return; + + /* balance_fs_bg is able to be pending */ + if (excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi); + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. 
*/ - if (has_not_enough_free_secs(sbi, 0)) { + if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, false); } @@ -304,14 +376,26 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + try_to_free_nids(sbi, MAX_FREE_NIDS); + else + build_free_nids(sbi); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || - excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + excess_prefree_segs(sbi) || + excess_dirty_nats(sbi) || + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) { + struct blk_plug plug; + + blk_start_plug(&plug); + sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + } f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -361,24 +445,28 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) { + if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { struct bio *bio = f2fs_bio_alloc(0); int ret; + atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); + atomic_dec(&fcc->submit_flush); bio_put(bio); return ret; } init_completion(&cmd.wait); + atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); return cmd.ret; } @@ -392,6 +480,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; + atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); 
SM_I(sbi)->cmd_control_info = fcc; @@ -513,28 +602,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - int err = -ENOTSUPP; - - if (test_opt(sbi, DISCARD)) { - struct seg_entry *se = get_seg_entry(sbi, - GET_SEGNO(sbi, blkaddr)); - unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - - if (f2fs_test_bit(offset, se->discard_map)) - return false; - - err = f2fs_issue_discard(sbi, blkaddr, 1); - } - - if (err) { - update_meta_page(sbi, NULL, blkaddr); - return true; - } - return false; -} - static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) @@ -573,7 +640,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool force = (cpc->reason == CP_DISCARD); int i; - if (se->valid_blocks == max_blocks) + if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) return; if (!force) { @@ -593,6 +660,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -630,6 +701,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason == CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -646,17 +719,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (!test_opt(sbi, DISCARD)) + if (force || !test_opt(sbi, DISCARD)) continue; - f2fs_issue_discard(sbi, 
START_BLOCK(sbi, start), + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SECNO(sbi, start); + start_segno = secno * sbi->segs_per_sec; + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + if (force && entry->len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); cpc->trimmed += entry->len; @@ -711,12 +798,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -817,12 +906,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) } } - sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -841,9 +930,9 @@ void 
update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) void *dst = page_address(page); if (src) - memcpy(dst, src, PAGE_CACHE_SIZE); + memcpy(dst, src, PAGE_SIZE); else - memset(dst, 0, PAGE_CACHE_SIZE); + memset(dst, 0, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -854,6 +943,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi, update_meta_page(sbi, (void *)sum_blk, blk_addr); } +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct page *page = grab_meta_page(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = (struct f2fs_summary_block *)page_address(page); + + mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + set_page_dirty(page); + f2fs_put_page(page, 1); +} + static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -886,9 +1000,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - MAIN_SEGS(sbi), *newseg + 1); - if (segno - *newseg < sbi->segs_per_sec - - (*newseg % sbi->segs_per_sec)) + (hint + 1) * sbi->segs_per_sec, *newseg + 1); + if (segno < (hint + 1) * sbi->segs_per_sec) goto got_it; } find_other_zone: @@ -1071,7 +1184,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 
0)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1120,6 +1233,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + if (test_opt(sbi, LFS)) + return; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segments(sbi, i); } @@ -1134,6 +1250,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1142,6 +1259,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (end <= MAIN_BLKADDR(sbi)) goto out; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + goto out; + } + /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : @@ -1164,12 +1287,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1256,7 +1383,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0)) + !has_not_enough_free_secs(sbi, 0, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1292,11 +1419,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); - allocate_data_block(fio->sbi, fio->page, fio->blk_addr, - &fio->blk_addr, sum, type); + if (fio->type == NODE || fio->type == DATA) + mutex_lock(&fio->sbi->wio_mutex[fio->type]); + + allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ f2fs_submit_page_mbio(fio); + + if (fio->type == NODE || fio->type == DATA) + mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1305,7 +1438,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .rw = WRITE_SYNC | REQ_META | REQ_PRIO, - .blk_addr = page->index, + .old_blkaddr = page->index, + .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, }; @@ -1335,19 +1469,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); - 
dn->data_blkaddr = fio->blk_addr; + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } void rewrite_data_page(struct f2fs_io_info *fio) { + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); f2fs_submit_page_mbio(fio); } -static void __f2fs_replace_block(struct f2fs_sb_info *sbi, - struct f2fs_summary *sum, +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg) + bool recover_curseg, bool recover_newaddr) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -1390,7 +1524,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg) + if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); @@ -1414,66 +1548,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, - unsigned char version, bool recover_curseg) + unsigned char version, bool recover_curseg, + bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); + __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr); - dn->data_blkaddr = new_addr; - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); -} - -static inline bool is_merged_page(struct f2fs_sb_info *sbi, - struct page *page, enum page_type type) -{ - enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio_vec *bvec; - struct page *target; - int i; - - down_read(&io->io_rwsem); - if (!io->bio) { - up_read(&io->io_rwsem); - return false; - } - - bio_for_each_segment_all(bvec, io->bio, 
i) { - - if (bvec->bv_page->mapping) { - target = bvec->bv_page; - } else { - struct f2fs_crypto_ctx *ctx; - - /* encrypted page */ - ctx = (struct f2fs_crypto_ctx *)page_private( - bvec->bv_page); - target = ctx->w.control_page; - } - - if (page == target) { - up_read(&io->io_rwsem); - return true; - } - } - - up_read(&io->io_rwsem); - return false; + f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type) + enum page_type type, bool ordered) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - if (is_merged_page(sbi, page, type)) - f2fs_submit_merged_bio(sbi, type, WRITE); - wait_on_page_writeback(page); + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + if (ordered) + wait_on_page_writeback(page); + else + wait_for_stable_page(page); } } @@ -1482,14 +1580,12 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, { struct page *cpage; - if (blkaddr == NEW_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return; - f2fs_bug_on(sbi, blkaddr == NULL_ADDR); - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_wait_on_page_writeback(cpage, DATA, true); f2fs_put_page(cpage, 1); } } @@ -1510,12 +1606,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, - SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ @@ -1539,7 +1634,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] 
= *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1611,7 +1706,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; @@ -1626,7 +1728,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int type = CURSEG_HOT_DATA; int err; - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = npages_for_summary_flush(sbi, true); if (npages >= 2) @@ -1666,13 +1768,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, - SUM_JOURNAL_SIZE); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ @@ -1694,7 +1795,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) 
continue; @@ -1718,17 +1819,13 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, else end = type + NR_CURSEG_NODE_TYPE; - for (i = type; i < end; i++) { - struct curseg_info *sum = CURSEG_I(sbi, i); - mutex_lock(&sum->curseg_mutex); - write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); - mutex_unlock(&sum->curseg_mutex); - } + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); } void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -1739,24 +1836,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { - for (i = 0; i < nats_in_cursum(sum); i++) { - if (le32_to_cpu(nid_in_journal(sum, i)) == val) + for (i = 0; i < nats_in_cursum(journal); i++) { + if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) - return update_nats_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { - for (i = 0; i < sits_in_cursum(sum); i++) - if (le32_to_cpu(segno_in_journal(sum, i)) == val) + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) - return update_sits_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); } return -1; } @@ -1785,7 +1882,7 @@ static struct 
page *get_next_sit_page(struct f2fs_sb_info *sbi, src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -1860,20 +1957,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi) static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; - segno = le32_to_cpu(segno_in_journal(sum, i)); + segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } /* @@ -1885,13 +1984,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = true; struct seg_entry *se; - mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) @@ -1908,7 +2006,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and add and account * them in sit entry set. 
*/ - if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL)) remove_sits_in_journal(sbi); /* @@ -1925,10 +2023,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int segno = start_segno; if (to_journal && - !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; - if (!to_journal) { + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { page = get_next_sit_page(sbi, start_segno); raw_sit = page_address(page); } @@ -1946,13 +2046,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); - segno_in_journal(sum, offset) = + segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, - &sit_in_journal(sum, offset)); + &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, @@ -1964,7 +2064,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) ses->entry_cnt--; } - if (!to_journal) + if (to_journal) + up_write(&curseg->journal_rwsem); + else f2fs_put_page(page, 1); f2fs_bug_on(sbi, ses->entry_cnt); @@ -1979,7 +2081,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) add_discard_addrs(sbi, cpc); } mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); set_prefree_as_free_segments(sbi); } @@ -2015,12 +2116,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || - 
!sit_i->sentries[start].ckpt_valid_map || - !sit_i->sentries[start].discard_map) + !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; + + if (f2fs_discard_en(sbi)) { + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } } sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2108,9 +2213,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; + init_rwsem(&array[i].journal_rwsem); + array[i].journal = kzalloc(sizeof(struct f2fs_journal), + GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; } @@ -2121,11 +2231,13 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi); + int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); @@ -2134,41 +2246,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; struct page *page; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) - == start) { - sit = sit_in_journal(sum, i); - 
mutex_unlock(&curseg->curseg_mutex); - goto got_it; - } - } - mutex_unlock(&curseg->curseg_mutex); - + se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); -got_it: + check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ - memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; - - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - + se->valid_blocks; } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks - old_valid_blocks; + } + up_read(&curseg->journal_rwsem); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -2301,7 +2430,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = 1 << 
F2FS_IPU_FSYNC; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; @@ -2383,8 +2516,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) if (!array) return; SM_I(sbi)->curseg_array = NULL; - for (i = 0; i < NR_CURSEG_TYPE; i++) + for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); + kfree(array[i].journal); + } kfree(array); } @@ -2450,7 +2585,7 @@ int __init create_segment_manager_caches(void) sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destory_discard_entry; + goto destroy_discard_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2460,7 +2595,7 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destory_discard_entry: +destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ee44d346ea44..fecb856ad874 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -158,16 +159,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* 
padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -183,7 +185,7 @@ struct segment_allocation { * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. */ -#define ATOMIC_WRITTEN_PAGE 0x0000ffff +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) @@ -191,6 +193,7 @@ struct segment_allocation { struct inmem_pages { struct list_head list; struct page *page; + block_t old_addr; /* for revoking when fail to commit */ }; struct sit_info { @@ -257,6 +260,8 @@ struct victim_selection { struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ @@ -466,20 +471,27 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (test_opt(sbi, LFS)) + return false; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + reserved_sections(sbi) + 1); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool 
has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi)); + return (free_sections(sbi) + freed) <= + (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -527,6 +539,9 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; + if (test_opt(sbi, LFS)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -540,7 +555,7 @@ static inline bool need_inplace_update(struct inode *inode) /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + is_inode_flag_set(inode, FI_NEED_IPU)) return true; return false; @@ -573,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -702,9 +717,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) - return 3 * sbi->blocks_per_seg; + return 8 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(sbi); + return 8 * MAX_BIO_BLOCKS(sbi); else return 0; } @@ -722,10 +737,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - 
desired = 4096; - else if (type == NODE) - desired = 3 * max_hw_blocks(sbi); + if (type == NODE) + desired = 2 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..46c915425923 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -13,6 +13,7 @@ #include #include "f2fs.h" +#include "node.h" static LIST_HEAD(f2fs_list); static DEFINE_SPINLOCK(f2fs_list_lock); @@ -25,14 +26,15 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) - return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) + return NM_I(sbi)->fcnt - MAX_FREE_NIDS; return 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..fd249cc9b96e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,35 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", +}; + +static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned int rate) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } 
else { + memset(ffi, 0, sizeof(struct f2fs_fault_info)); + } +} +#endif + /* f2fs-wide shrinker description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -51,6 +80,7 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, + Opt_nodiscard, Opt_noheap, Opt_user_xattr, Opt_nouser_xattr, @@ -61,12 +91,19 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_inline_dentry, + Opt_noinline_dentry, Opt_flush_merge, + Opt_noflush_merge, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, + Opt_mode, + Opt_fault_injection, + Opt_lazytime, + Opt_nolazytime, Opt_err, }; @@ -75,6 +112,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, @@ -85,12 +123,19 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, + {Opt_mode, "mode=%s"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_err, NULL}, }; @@ -100,6 +145,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -121,9 +170,27 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return 
(unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif return NULL; } +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -157,6 +224,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -202,6 +273,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \ f2fs_sbi_show, f2fs_sbi_store, \ offsetof(struct struct_name, elname)) +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); @@ -214,9 +288,16 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -234,7 +315,14 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), NULL, }; @@ -330,6 +418,8 @@ static int parse_options(struct super_block *sb, char *options) "the device does not support discard"); } break; + case Opt_nodiscard: + clear_opt(sbi, DISCARD); case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -388,9 +478,15 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; @@ -406,6 +502,42 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + 
set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, arg); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -426,26 +558,25 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - - set_inode_flag(fi, FI_NEW_INODE); - - if (test_opt(F2FS_SB(sb), INLINE_XATTR)) - set_inode_flag(fi, FI_INLINE_XATTR); + init_rwsem(&fi->dio_rwsem[READ]); + init_rwsem(&fi->dio_rwsem[WRITE]); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - fi->i_crypt_info = NULL; -#endif return &fi->vfs_inode; } @@ -458,7 +589,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); @@ -466,32 +597,66 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if 
(f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); - i_size_write(inode, 0); + f2fs_i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, - F2FS_I(inode)->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } return 0; } + return generic_drop_inode(inode); } +int f2fs_inode_dirtied(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 1; + } + + set_inode_flag(inode, FI_DIRTY_INODE); + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + stat_inc_dirty_inode(sbi, DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + return 0; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + list_del_init(&F2FS_I(inode)->gdirty_list); + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * @@ -499,7 +664,19 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode); } static void f2fs_i_callback(struct rcu_head *head) @@ -510,15 +687,27 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -534,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb) * clean checkpoint again. */ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -548,12 +737,15 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. 
*/ - release_dirty_inode(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_bios(sbi); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -566,13 +758,18 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +779,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -623,7 +818,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -676,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) @@ -686,6 +883,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, 
",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -718,19 +923,47 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) +static int segment_bits_seq_show(struct seq_file *seq, void *offset) { - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; } -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .owner = THIS_MODULE, \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS 
parameters */ @@ -738,7 +971,16 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + sbi->sb->s_flags |= MS_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + set_opt(sbi, DISCARD); + } else { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -746,6 +988,10 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0); +#endif } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -756,8 +1002,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - - sync_filesystem(sb); +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info ffi = sbi->fault_info; +#endif /* * Save the old mount options in case we @@ -766,6 +1013,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + sbi->mount_opt.opt = 0; default_options(sbi); @@ -797,7 +1053,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -807,6 +1062,16 @@ static int f2fs_remount(struct super_block *sb, int 
*flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -820,8 +1085,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -834,6 +1100,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; +#ifdef CONFIG_F2FS_FAULT_INJECTION + sbi->fault_info = ffi; +#endif return err; } @@ -853,6 +1122,48 @@ static struct super_operations f2fs_sops = { .remount_fs = f2fs_remount, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static unsigned f2fs_max_namelen(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? 
+ inode->i_sb->s_blocksize : F2FS_NAME_LEN; +} + +static struct fscrypt_operations f2fs_cryptops = { + .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, + .set_context = f2fs_set_context, + .is_encrypted = f2fs_encrypted_inode, + .empty_dir = f2fs_empty_dir, + .max_namelen = f2fs_max_namelen, +}; +#else +static struct fscrypt_operations f2fs_cryptops = { + .is_encrypted = f2fs_encrypted_inode, +}; +#endif + static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -898,7 +1209,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,13 +1225,131 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } -static int sanity_check_raw_super(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) { + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = 
le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_end_blkaddr > seg_end_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) 
end(%u) block(%u)", + main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_msg(sb, KERN_INFO, + "Fix alignment : %s, start(%u) end(%u) block(%u)", + res, main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } + return false; +} + +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -931,10 +1360,10 @@ static int sanity_check_raw_super(struct super_block *sb, } /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + if (F2FS_BLKSIZE != PAGE_SIZE) { f2fs_msg(sb, KERN_INFO, "Invalid page_cache_size (%lu), supports only 4KB\n", - PAGE_CACHE_SIZE); + PAGE_SIZE); return 1; } @@ -947,6 +1376,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || @@ -965,10 +1402,27 @@ static int 
sanity_check_raw_super(struct super_block *sb, le32_to_cpu(raw_super->log_sectorsize)); return 1; } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return 1; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sbi, bh)) + return 1; + return 0; } -static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -994,7 +1448,6 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1014,111 +1467,131 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + mutex_init(&sbi->wio_mutex[NODE]); + mutex_init(&sbi->wio_mutex[DATA]); + spin_lock_init(&sbi->cp_lock); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = 
percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); } /* * Read f2fs raw super block. - * Because we have two copies of super block, so read the first one at first, - * if the first one is invalid, move to read the second one. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * them recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { - int block = 0; - struct buffer_head *buffer; + struct super_block *sb = sbi->sb; + int block; + struct buffer_head *bh; struct f2fs_super_block *super; int err = 0; -retry: - buffer = sb_bread(sb, block); - if (!buffer) { - *recovery = 1; - f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + bh = sb_bread(sb, block); + if (!bh) { + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { err = -EIO; - goto out; + continue; } - } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); - - /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); - *recovery = 1; - f2fs_msg(sb, KERN_ERR, - "Can't find valid F2FS filesystem in %dth superblock", - block + 1); - if (block == 0) { - block++; - goto retry; - } else { + /* sanity checking of raw super */ + if (sanity_check_raw_super(sbi, bh)) { + f2fs_msg(sb, KERN_ERR, + "Can't find 
valid F2FS filesystem in %dth superblock", + block + 1); err = -EINVAL; - goto out; + brelse(bh); + continue; } + + if (!*raw_super) { + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + brelse(bh); } - if (!*raw_super) { - *raw_super_buf = buffer; - *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); - } + /* Fail to read any one of the superblocks*/ + if (err < 0) + *recovery = 1; - /* check the validity of the second superblock */ - if (block == 0) { - block++; - goto retry; - } - -out: /* No valid superblock */ if (!*raw_super) - return err; + kfree(super); + else + err = 0; - return 0; + return err; } int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; + struct buffer_head *bh; int err; - /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } - sbh->b_blocknr = block; + /* write back-up superblock first */ + bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 
0: 1); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); + bh = sb_getblk(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); return err; } @@ -1126,17 +1599,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1144,17 +1617,31 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + sbi->sb = sb; + + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver."); + err = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto free_sbi; + } + /* set a block size */ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1167,11 
+1654,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); sb->s_op = &f2fs_sops; + sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; @@ -1181,11 +1671,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; - sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); @@ -1206,6 +1693,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1220,24 +1711,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_meta_inode; } - /* sanity checking of checkpoint */ - err = -EINVAL; - if (sanity_check_ckpt(sbi)) { - f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); - goto free_cp; - } - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = 
le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1257,6 +1743,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_nm; } + /* For write statistics */ + if (sb->s_bdev->bd_part) + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + build_gc_manager(sbi); /* get an inode for node space */ @@ -1300,9 +1797,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1318,7 +1818,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * previous checkpoint was not done by clean system shutdown. 
*/ if (bdev_read_only(sb->s_bdev) && - !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; goto free_kobj; } @@ -1326,14 +1826,27 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + if (!retry) + goto skip_recovery; + + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); + goto free_kobj; + } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); goto free_kobj; } } +skip_recovery: /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1350,20 +1863,26 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { - f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); - f2fs_commit_super(sbi, true); + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_msg(sb, KERN_INFO, + "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 
1 : 2, err); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: + f2fs_sync_inode_meta(sbi); kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); @@ -1371,7 +1890,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) dput(sb->s_root); sb->s_root = NULL; free_node_inode: + truncate_inode_pages_final(NODE_MAPPING(sbi)); mutex_lock(&sbi->umount_mutex); + release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); @@ -1379,16 +1900,18 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -free_cp: kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); /* give only one another chance */ @@ -1424,8 +1947,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1467,25 +1991,23 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } - err = f2fs_init_crypto(); - if (err) - goto free_kset; - err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_crypto; + goto free_kset; err = 
register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_crypto: - f2fs_exit_crypto(); free_kset: kset_unregister(f2fs_kset); free_extent_cache: @@ -1506,15 +2028,14 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); - f2fs_exit_crypto(); + unregister_shrinker(&f2fs_shrinker_info); + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 145fb659ad44..562ce0821559 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -29,7 +29,8 @@ static inline void __print_last_io(void) last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, last_io.fio.blk_addr, + last_io.fio.rw, + last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); } @@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) last_io.pid == pid && last_io.type == __file_type(inode, pid) && last_io.fio.rw == fio->rw && - last_io.fio.blk_addr + last_io.len == fio->blk_addr) { + last_io.fio.new_blkaddr + last_io.len == + fio->new_blkaddr) { last_io.len++; return; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 862368a32e53..69c6bb9cf207 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); 
return 0; } @@ -264,18 +264,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static void *read_all_xattrs(struct inode *inode, struct page *ipage) +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; + int err; inline_size = inline_xattr_size(inode); txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) - return NULL; + return -ENOMEM; /* read from inline xattr */ if (inline_size) { @@ -286,8 +288,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_addr = inline_xattr_addr(ipage); } else { page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + err = PTR_ERR(page); goto fail; + } inline_addr = inline_xattr_addr(page); } memcpy(txattr_addr, inline_addr, inline_size); @@ -301,8 +305,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) /* The inode already has an extended attribute block. 
*/ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); goto fail; + } xattr_addr = page_address(xpage); memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); @@ -316,10 +322,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); header->h_refcount = cpu_to_le32(1); } - return txattr_addr; + *base_addr = txattr_addr; + return 0; fail: kzfree(txattr_addr); - return NULL; + return err; } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, @@ -345,7 +352,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); + set_page_dirty(ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -353,7 +361,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(page); } inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); f2fs_put_page(page, 1); @@ -374,7 +382,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(xpage); } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE); + f2fs_wait_on_page_writeback(xpage, NODE, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -412,9 +420,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { @@ -448,9 +456,9 @@ ssize_t f2fs_listxattr(struct 
dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; - base_addr = read_all_xattrs(inode, NULL); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, NULL, &base_addr); + if (error) + return error; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -481,13 +489,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t len; __u32 new_hsize; - int error = -ENOMEM; + int error = 0; if (name == NULL) return -EINVAL; @@ -503,9 +510,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - goto exit; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -538,7 +545,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } @@ -566,7 +573,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. 
*/ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); @@ -580,19 +586,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - - if (ipage) - update_inode(inode, ipage); - else - update_inode_page(inode); + f2fs_mark_inode_dirty_sync(inode); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: kzfree(base_addr); return error; @@ -609,7 +613,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -618,5 +622,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100d5492..d2fd0387a3c7 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #define f2fs_xattr_handlers NULL static inline int f2fs_setxattr(struct inode *inode, int index, - const char *name, const void *value, size_t size, int flags) + const char *name, const void *value, size_t size, + struct page *page, int flags) { return -EOPNOTSUPP; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8d7151eb6ceb..2d0a78050936 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -228,6 +228,7 @@ struct 
dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ +#define DCACHE_ENCRYPTED_WITH_KEY 0x04000000 /* dir is encrypted with a valid key */ #define DCACHE_OP_REAL 0x08000000 extern seqlock_t rename_lock; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 25c6324a0dd0..422630b8e588 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -21,7 +21,7 @@ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ #define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ -#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE) +#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ @@ -51,6 +51,7 @@ #define MAX_ACTIVE_DATA_LOGS 8 #define VERSION_LEN 256 +#define MAX_VOLUME_NAME 512 /* * For superblock @@ -84,7 +85,7 @@ struct f2fs_super_block { __le32 node_ino; /* node inode number */ __le32 meta_ino; /* meta inode number */ __u8 uuid[16]; /* 128-bit uuid for volume */ - __le16 volume_name[512]; /* volume name */ + __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */ __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ __le32 cp_payload; @@ -99,6 +100,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 #define CP_ERROR_FLAG 0x00000008 @@ -169,12 +171,12 @@ struct f2fs_extent { #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ -#define ADDRS_PER_INODE(fi) 
addrs_per_inode(fi) +#define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ -#define ADDRS_PER_PAGE(page, fi) \ - (IS_INODE(page) ? ADDRS_PER_INODE(fi) : ADDRS_PER_BLOCK) +#define ADDRS_PER_PAGE(page, inode) \ + (IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) @@ -261,7 +263,7 @@ struct f2fs_node { /* * For NAT entries */ -#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -281,7 +283,7 @@ struct f2fs_nat_block { * Not allow to change this. */ #define SIT_VBLOCK_MAP_SIZE 64 -#define SIT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_sit_entry)) +#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) /* * Note that f2fs_sit_entry->vblocks has the following bit-field information. @@ -344,7 +346,7 @@ struct f2fs_summary { struct summary_footer { unsigned char entry_type; /* SUM_TYPE_XXX */ - __u32 check_sum; /* summary checksum */ + __le32 check_sum; /* summary checksum */ } __packed; #define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\ @@ -357,6 +359,12 @@ struct summary_footer { sizeof(struct sit_journal_entry)) #define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ sizeof(struct sit_journal_entry)) + +/* Reserved area should make size of f2fs_extra_info equals to + * that of nat_journal and sit_journal. 
+ */ +#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8) + /* * frequently updated NAT/SIT entries can be stored in the spare area in * summary blocks @@ -386,18 +394,28 @@ struct sit_journal { __u8 reserved[SIT_JOURNAL_RESERVED]; } __packed; -/* 4KB-sized summary block structure */ -struct f2fs_summary_block { - struct f2fs_summary entries[ENTRIES_IN_SUM]; +struct f2fs_extra_info { + __le64 kbytes_written; + __u8 reserved[EXTRA_INFO_RESERVED]; +} __packed; + +struct f2fs_journal { union { __le16 n_nats; __le16 n_sits; }; - /* spare area is used by NAT or SIT journals */ + /* spare area is used by NAT or SIT journals or extra info */ union { struct nat_journal nat_j; struct sit_journal sit_j; + struct f2fs_extra_info info; }; +} __packed; + +/* 4KB-sized summary block structure */ +struct f2fs_summary_block { + struct f2fs_summary entries[ENTRIES_IN_SUM]; + struct f2fs_journal journal; struct summary_footer footer; } __packed; @@ -491,4 +509,6 @@ enum { F2FS_FT_MAX }; +#define S_SHIFT 12 + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0166582c4d78..a88271902ff2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -52,6 +52,8 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct fscrypt_info; +struct fscrypt_operations; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -676,6 +678,9 @@ struct inode { struct hlist_head i_fsnotify_marks; #endif +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + struct fscrypt_info *i_crypt_info; +#endif void *i_private; /* fs or device private pointer */ }; @@ -1331,6 +1336,8 @@ struct super_block { #endif const struct xattr_handler **s_xattr; + const struct fscrypt_operations *s_cop; + struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h new 
file mode 100644 index 000000000000..76cff18bb032 --- /dev/null +++ b/include/linux/fscrypto.h @@ -0,0 +1,435 @@ +/* + * General per-file encryption definition + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#ifndef _LINUX_FSCRYPTO_H +#define _LINUX_FSCRYPTO_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +struct fscrypt_info { 
+ u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_WRITE_PATH_FL 0x00000002 + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ + u8 mode; /* Encryption mode for tfm */ +}; + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int fscrypt_key_size(int mode) +{ + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + return FS_AES_256_XTS_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_GCM: + return FS_AES_256_GCM_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CBC: + return FS_AES_256_CBC_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CTS: + return FS_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define FS_FNAME_NUM_SCATTER_ENTRIES 4 +#define FS_CRYPTO_BLOCK_SIZE 16 +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. 
+ */ +static inline u32 fscrypt_symlink_data_len(u32 l) +{ + if (l < FS_CRYPTO_BLOCK_SIZE) + l = FS_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct fscrypt_symlink_data) - 1); +} + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * crypto opertions for filesystems + */ +struct fscrypt_operations { + int (*get_context)(struct inode *, void *, size_t); + int (*key_prefix)(struct inode *, u8 **); + int (*prepare_context)(struct inode *); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); +} + +static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) +{ + if (size == fscrypt_key_size(mode)) + return size; + return 0; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' 
&& str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +extern const struct dentry_operations fscrypt_d_ops; +#endif + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + d_set_d_op(dentry, &fscrypt_d_ops); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +int fscrypt_initialize(void); + +extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); +extern int fscrypt_decrypt_page(struct page *); +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern void fscrypt_restore_control_page(struct page *); +extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, + unsigned int); +/* policy.c */ +extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); +extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ 
+extern int get_crypt_info(struct inode *); +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct fscrypt_name *); +extern void fscrypt_free_filename(struct fscrypt_name *); +extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); +#endif + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, + gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) +{ + return; +} + +static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, + struct page *p, gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_notsupp_decrypt_page(struct page *p) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c, + struct bio *b) +{ + return; +} + +static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b) +{ + return; +} + +static inline void fscrypt_notsupp_restore_control_page(struct page *p) +{ + return; +} + +static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, + sector_t s, unsigned int f) +{ + return -EOPNOTSUPP; +} + +/* policy.c */ +static inline int fscrypt_notsupp_process_policy(struct file *f, + const struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_get_policy(struct inode *i, + struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int 
fscrypt_notsupp_has_permitted_context(struct inode *p, + struct inode *i) +{ + return 0; +} + +static inline int fscrypt_notsupp_inherit_context(struct inode *p, + struct inode *i, void *v, bool b) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_notsupp_get_encryption_info(struct inode *i) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_put_encryption_info(struct inode *i, + struct fscrypt_info *f) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_notsupp_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c) +{ + return; +} + +static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} +#endif /* _LINUX_FSCRYPTO_H */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 00b4a6308249..3a09bb4dc3b2 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -52,6 +52,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ 
{ INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -693,28 +694,32 @@ TRACE_EVENT(f2fs_direct_IO_exit, __entry->ret) ); -TRACE_EVENT(f2fs_reserve_new_block, +TRACE_EVENT(f2fs_reserve_new_blocks, - TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node), + TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, + blkcnt_t count), - TP_ARGS(inode, nid, ofs_in_node), + TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) + __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; + __entry->count = count; ), - TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u", + TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry), (unsigned int)__entry->nid, - __entry->ofs_in_node) + __entry->ofs_in_node, + (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_page_bio, @@ -727,7 +732,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) - __field(block_t, blkaddr) + __field(block_t, old_blkaddr) + __field(block_t, new_blkaddr) __field(int, rw) __field(int, type) ), @@ -736,16 +742,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; - __entry->blkaddr = fio->blk_addr; + __entry->old_blkaddr = fio->old_blkaddr; + __entry->new_blkaddr = fio->new_blkaddr; __entry->rw = fio->rw; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "blkaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", show_dev_ino(__entry), (unsigned long)__entry->index, - (unsigned long long)__entry->blkaddr, + (unsigned long 
long)__entry->old_blkaddr, + (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->rw), show_block_type(__entry->type)) ); @@ -1265,6 +1273,44 @@ TRACE_EVENT(f2fs_destroy_extent_tree, __entry->node_cnt) ); +DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(s64, count) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->type = type; + __entry->count = count; + ), + + TP_printk("dev = (%d,%d), %s, dirty count = %lld", + show_dev(__entry), + show_file_type(__entry->type), + __entry->count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f15d980249b5..c8c093e8c83d 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -170,6 +170,24 @@ struct inodes_stat_t { #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +/* + * File system encryption support + */ +/* Policy provided via an ioctl on the topmost directory */ +#define FS_KEY_DESCRIPTOR_SIZE 8 + +struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; +} __packed; + +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ From 
a1561fae1b30467bddc2e0cf7752125ee54fe2e4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 13:38:41 -0700 Subject: [PATCH 0002/1212] f2fs: fix wrong sum_page pointer in f2fs_gc This patch fixes using a wrong pointer for sum_page in f2fs_gc. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0a0a1ad1fe1f..4336807cc690 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -848,16 +848,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); From 2ca2001b3a36ad58081e6907c396072a80a1ecc9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Nov 2016 14:06:03 -0800 Subject: [PATCH 0003/1212] posix_acl: Clear SGID bit when setting file permissions Cherry-pick to f2fs only for generic/375 from: (073931017: posix_acl: Clear SGID bit when setting file permissions) Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 ++---- fs/posix_acl.c | 31 +++++++++++++++++++++++++++++++ include/linux/posix_acl.h | 1 + 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fb0744b94c2f..4a34040932e9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -215,12 +215,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = 
posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(inode, inode->i_mode); - if (error == 0) - acl = NULL; } break; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 34bd1bd354e6..a60d3cc5b55d 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -592,6 +592,37 @@ posix_acl_create(struct inode *dir, umode_t *mode, } EXPORT_SYMBOL_GPL(posix_acl_create); +/** + * posix_acl_update_mode - update mode in set_acl + * + * Update the file mode when setting an ACL: compute the new file permission + * bits based on the ACL. In addition, if the ACL is equivalent to the new + * file mode, set *acl to NULL to indicate that no ACL should be set. + * + * As with chmod, clear the setgit bit if the caller is not in the owning group + * or capable of CAP_FSETID (see inode_change_ok). + * + * Called from set_acl inode operations. + */ +int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +EXPORT_SYMBOL(posix_acl_update_mode); + /* * Fix up the uids and gids in posix acl extended attributes in place. 
*/ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 3e96a6a76103..d1a8ad7e5ae4 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -95,6 +95,7 @@ extern int set_posix_acl(struct inode *, int, struct posix_acl *); extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); +extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **); extern int simple_set_acl(struct inode *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); From 4d42545f4996ba997eecd6c02bd1e3a816695bcd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Nov 2016 10:51:17 -0800 Subject: [PATCH 0004/1212] f2fs: fix overflow due to condition check order In the last ilen case, i was already increased, resulting in accessing out- of-boundary entry of do_replace and blkaddr. Fix to check ilen first to exit the loop. Fixes: 2aa8fbb9693020 ("f2fs: refactor __exchange_data_block for speed up") Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c6e33258fabf..5c4ea4cf2fb1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -971,7 +971,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { From 34a546cb043f95529a24ff042f2cdcf72b25b4f0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Nov 2016 12:45:15 -0800 Subject: [PATCH 0005/1212] f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack We don't guarantee cp_addr is fixed by cp_version. This is to sync with f2fs-tools. 
Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +++++++- fs/f2fs/f2fs.h | 28 +++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cb23d6cf676b..1608ae8eea97 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -770,6 +770,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) if (sanity_check_ckpt(sbi)) goto fail_no_cp; + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + if (cp_blks <= 1) goto done; @@ -1121,7 +1126,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1185,6 +1190,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index af293e84e5cd..45d1e4522760 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -1354,22 +1355,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - start_addr = 
le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; - return start_addr; } +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); From 75a192655e64b2a76433acdc759cbd509de1efac Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:34 +0800 Subject: [PATCH 0006/1212] f2fs: exclude free nids building and allocation During nid allocation, it needs to exclude building and allocating flow of free nids, this is because while building free nid cache, there are two steps: a) load free nids from unused nat entries in NAT pages, b) update free nid cache by checking nat journal. The two steps should be atomical, otherwise an used nid can be allocated as free one after a) and before b). This patch adds missing lock which covers build_free_nids in unlock_operation and f2fs_balance_fs_bg to avoid that. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b1e615ed2bef..a8c2bd3e5029 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1786,7 +1786,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1840,6 +1840,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1876,9 +1883,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->free_nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); goto retry; } From 7a2d5d5f8150767cc1db952bf775f2d712dc33f1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:00 +0800 Subject: [PATCH 0007/1212] f2fs: fix to release discard entries during checkpoint In f2fs_fill_super, if any IO error occurs during recovery, cached discard entries will be leaked. To avoid this, make write_checkpoint() handle memory release by itself; in addition, move clear_prefree_segments to write_checkpoint for readability.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/checkpoint.c --- fs/f2fs/checkpoint.c | 5 ++++- fs/f2fs/super.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1608ae8eea97..63ca342a3cc8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1187,7 +1187,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1264,6 +1263,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); + if (err) + release_discard_addrs(sbi); + else + clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fd249cc9b96e..006138a6c5ab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -738,7 +738,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); From 372f295d622c643f865e3cb83b2cf9b23f5bc49b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:01 +0800 Subject: [PATCH 0008/1212] f2fs: give a chance to detach from dirty list If there is no dirty pages in inode, we should give a chance to detach the inode from global dirty list, otherwise it needs to call another unnecessary .writepages for detaching. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++--- fs/f2fs/dir.c | 1 + fs/f2fs/gc.c | 4 +++- fs/f2fs/inline.c | 4 +++- fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 4 +++- 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7a3ac306a57c..15c0fe40ed5c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1776,12 +1776,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e634a637c443..c0dba11519cf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4336807cc690..72a0ca08f901 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -670,8 +670,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a04c1016d511..b21a0788f2cd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -136,8 +136,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); 
f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8c2bd3e5029..97eb2c0811b5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1203,6 +1203,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3c61ae37f92..75477ec6c535 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,8 +272,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); From 48cceaae9acbd68469dfd371419fee736bad3e58 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:02 +0800 Subject: [PATCH 0009/1212] f2fs: add missing f2fs_balance_fs in f2fs_zero_range f2fs_balance_fs should be called in between node page updates; otherwise the node page count will far exceed the watermark that triggers foreground garbage collection, resulting in a high risk of hitting LFS allocation failure.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c4ea4cf2fb1..c0774c98dce4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1222,6 +1222,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; From fc843bf42b3ab83b44112895ad77b43ea9249eb0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:03 +0800 Subject: [PATCH 0010/1212] f2fs: don't miss any f2fs_balance_fs cases In f2fs_map_blocks, let f2fs_balance_fs detect node page modification with dn.node_changed to avoid missing some corner cases. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 15c0fe40ed5c..01dc6ac79224 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -670,7 +670,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -729,10 +728,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -787,7 +784,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -806,9 +802,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + 
f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -816,7 +811,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); From a24f28d74694db4639ad644246dcc330b0cef2c4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:04 +0800 Subject: [PATCH 0011/1212] f2fs: be aware of extent beyond EOF in fiemap f2fs can support fallocating blocks beyond file size without changing the size, but ->fiemap of f2fs was restricted and can't detect these extents fallocated past EOF, now relieve the restriction. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 01dc6ac79224..6e00e017bb4f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -880,7 +880,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -897,13 +896,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -922,13 +914,11 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. 
- * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. - */ + flags |= FIEMAP_EXTENT_LAST; } From 70aa0e6cb1a3be365613997c33bd468b01bce93e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:05 +0800 Subject: [PATCH 0012/1212] f2fs: fix to update largest extent under lock In order to avoid a racing problem, make sure the largest extent cache is updated under lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..1fbebcb33a9d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -252,6 +252,7 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +268,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); From a6c3b7211039846974b5b80e68032951a0999f86 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:06 +0800 Subject: [PATCH 0013/1212] f2fs: fix error handling in fsync_node_pages In fsync_node_pages, if f2fs was tagged with CP_ERROR_FLAG, make sure bio cache was flushed before returning. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 97eb2c0811b5..bc38e5a92b4b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1338,7 +1338,8 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1411,7 +1412,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unlock_page(last_page); goto retry; } - +out: if (nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; From 5f3ec1f715c1c18b544e2480472c3a2cdf19a425 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 11 Oct 2016 10:36:12 -0700 Subject: [PATCH 0014/1212] f2fs: fix sparse warnings f2fs contained a number of endianness conversion bugs. Also, one function should have been 'static'. 
Found with sparse by running 'make C=2 CF=-D__CHECK_ENDIAN__ fs/f2fs/' Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/dir.c | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 5 +++-- fs/f2fs/node.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c0dba11519cf..7136dc1ade11 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b21a0788f2cd..06d20489d532 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -420,7 +420,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc38e5a92b4b..d2ba37a84f8e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..cfdcf98516a1 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -313,7 +313,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == 
cpver_of_node(page); } /* From a943c829bed9925e110fe5ea6891a10d7607e781 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:56:59 +0800 Subject: [PATCH 0015/1212] f2fs: clear nlink if fail to add_link We don't need to keep incomplete created inode in cache, so if we fail to add link into directory during new inode creation, it's better to set nlink of inode to zero, then we can evict inode immediately. Otherwise release of the nid belonging to the inode will be delayed until inode cache is being shrunk, it may cause a seemingly endless loop while allocating free nids in time of testing generic/269 case of the fstests suite. Signed-off-by: Chao Yu [Jaegeuk Kim: add update_inode_page to fix kernel panic] Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1fbebcb33a9d..d32fd0343eae 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -387,6 +387,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -427,6 +429,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file. 
*/ unlock_new_inode(inode); From 3499fdbee609d03aedc25ebd7baa420e343e4dbb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Oct 2016 19:28:29 +0800 Subject: [PATCH 0016/1212] f2fs: split free nid list During free nid allocation, in order to do preallocation, we will tag free nid entry as allocated one and still leave it in free nid list, for other allocators who want to grab free nids, it needs to traverse the free nid list for lookup. It becomes overhead in scenario of allocating free nid intensively by multithreads. This patch splits free nid list to two list: {free,alloc}_nid_list, to keep free nids and preallocated free nids separately, after that, traverse latency will be gone, besides split nid_cnt for separate statistic. Additionally, introduce __insert_nid_to_list and __remove_nid_from_list for cleanup. Signed-off-by: Chao Yu [Jaegeuk Kim: modify f2fs_bug_on to avoid needless branches] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 ++-- fs/f2fs/f2fs.h | 14 +++-- fs/f2fs/node.c | 136 +++++++++++++++++++++++++++------------------ fs/f2fs/node.h | 11 ++-- fs/f2fs/shrinker.c | 4 +- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fb245bd302e4..6af146c48644 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -74,7 +74,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +195,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + 
si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -324,8 +327,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 45d1e4522760..cec025852c22 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -529,6 +529,12 @@ static inline void __try_update_largest_extent(struct inode *inode, } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -548,9 +554,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -2214,7 +2220,7 @@ struct f2fs_stat_info { s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, 
free_nids, alloc_nids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d2ba37a84f8e..5bb2fa324e68 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -1699,10 +1699,31 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, struct free_nid *i) { - list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); } +static void __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? 
i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; + list_del(&i->list); +} + static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1733,33 +1754,33 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, FREE_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1798,7 +1819,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ @@ -1834,7 +1855,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1867,23 +1888,22 
@@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ build_free_nids(sbi); @@ -1898,11 +1918,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1919,17 +1940,20 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + + __remove_nid_from_list(sbi, i, 
ALLOC_NID_LIST); + if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; } else { i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST); } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1941,24 +1965,26 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2014,7 +2040,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2044,7 +2070,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2278,20 +2304,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used 
nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2336,17 +2364,19 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index cfdcf98516a1..e7997e240366 100644 --- 
a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..ec539f407cc4 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -26,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; + if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) + return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; return 0; } From bae23863f810952bf54caf9ee56cd3b2763d22bf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 10:09:59 -0700 Subject: [PATCH 0017/1212] f2fs: clean up free nid list operations This patch cleans up to use consistent free nid list ops. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 56 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5bb2fa324e68..ef5357c7af24 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1696,25 +1696,26 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) -{ - radix_tree_delete(&nm_i->free_nid_root, i->nid); -} - -static void __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : i->state != NID_ALLOC); nm_i->nid_cnt[list]++; list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; } static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) + struct free_nid *i, enum nid_list list, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1722,6 +1723,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, i->state != NID_ALLOC); nm_i->nid_cnt[list]--; list_del(&i->list); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1729,6 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; + int err; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1755,15 +1759,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) } spin_lock(&nm_i->nid_list_lock); - if 
(radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - __insert_nid_to_list(sbi, i, FREE_NID_LIST); - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); return 1; } @@ -1776,8 +1778,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1897,9 +1898,9 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1921,8 +1922,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -1944,14 +1944,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, 
ALLOC_NID_LIST, true); i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST); + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } spin_unlock(&nm_i->nid_list_lock); @@ -1978,9 +1977,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); - + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2367,8 +2364,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From b1b14da24aab69fbb84159fe5c57035dafc50276 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:35 +0800 Subject: [PATCH 0018/1212] f2fs: don't interrupt free nids building during nid allocation Let build_free_nids support sync/async methods, in allocation flow of nids, we use synchronous method, so that we can avoid looping in alloc_nid when free memory is low; in unblock_operations and f2fs_balance_fs_bg we use asynchronous method where a low memory condition can interrupt us. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 22 ++++++++++------------ fs/f2fs/segment.c | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 63ca342a3cc8..1d273f51bc1c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -990,7 +990,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cec025852c22..9e8d3c9af54a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2071,7 +2071,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ef5357c7af24..5800a1082fe8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1734,9 +1734,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct nat_entry *ne; int err; - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; - /* 0 nid should not be used */ if (unlikely(nid == 0)) return 0; @@ -1804,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void __build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct 
f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1823,6 +1818,9 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; + if (!sync && !available_free_memory(sbi, FREE_NIDS)) + return; + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1865,10 +1863,10 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi); + __build_free_nids(sbi, sync); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1907,7 +1905,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi); + build_free_nids(sbi, true); goto retry; } @@ -2344,7 +2342,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 75477ec6c535..48903702de27 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -380,7 +380,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || From 518a2cf9a065fe11df9382dfb089c745c6ebca69 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:36 +0800 Subject: [PATCH 0019/1212] f2fs: avoid casted negative value as shrink count This patch makes sure it returns a positive value instead of a probable casted negative value as shrink count. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/shrinker.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index ec539f407cc4..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) - return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) From 2c58f7dea88d81f35e7e78fb6a3af41f9d759346 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:28:05 -0700 Subject: [PATCH 0020/1212] f2fs: count dirty inodes to flush node pages during checkpoint If there are a lot of dirty inodes, we need to flush all of them when doing checkpoint. So, we need to count this for enough free space. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..762743988426 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) From 4ce47914814383d34518cb22121b3216fbcefec1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:30:31 -0700 Subject: [PATCH 0021/1212] f2fs: call f2fs_balance_fs for setattr If inode becomes dirty, we need to check the # of dirty inodes whether or not further checkpoint would be required. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c0774c98dce4..53ba384cb675 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -695,7 +695,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -724,6 +723,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } f2fs_mark_inode_dirty_sync(inode); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } From cab4de5c485aa661a3019ca92bf617294652fb4e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2016 15:36:31 -0700 Subject: [PATCH 0022/1212] f2fs: declare static function for __build_free_nids This patch avoids build warning. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5800a1082fe8..e1ce0b8438fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1806,7 +1806,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); From 1d486e74cf8427152f96688d466a1c57a44a7642 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Oct 2016 11:07:45 -0700 Subject: [PATCH 0023/1212] f2fs: use BIO_MAX_PAGES for bio allocation We don't need to allocate bio partially in order to maximize sequential writes. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 +--- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 17 +++-------------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1d273f51bc1c..1dffe86651be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -226,7 +226,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6e00e017bb4f..465fa9d62485 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -272,10 +272,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e1ce0b8438fc..389be7f6e07c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2099,7 +2099,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2108,7 +2107,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 48903702de27..ec4d74c26067 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2239,10 +2239,10 @@ static void 
build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 762743988426..89ab4301ef02 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -102,8 +102,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -696,13 +694,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. 
@@ -720,7 +711,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -737,11 +728,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; From 3139e5f850cdbafc1b0b6f787e1473c5ea687603 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Nov 2016 18:04:05 -0800 Subject: [PATCH 0024/1212] f2fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps This is for backport only. fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 22 ++++++++++++++++++++++ fs/f2fs/file.c | 8 ++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/namei.c | 8 ++++---- fs/f2fs/xattr.c | 2 +- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7136dc1ade11..3b8ebec0450b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -312,7 +312,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -465,7 +465,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -683,7 +683,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) 
if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -730,7 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8d3c9af54a..1938fe457041 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,6 +138,28 @@ static inline void inode_nohighmem(struct inode *inode) mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } +/** + * current_time - Return FS time + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs. + * + * Note that inode and inode->sb cannot be NULL. + * Otherwise, the function warns and returns time without truncation. 
+ */ +static inline struct timespec current_time(struct inode *inode) +{ + struct timespec now = current_kernel_time(); + + if (unlikely(!inode->i_sb)) { + WARN(1, "current_time() called with uninitialized super_block in the inode"); + return now; + } + + return timespec_trunc(now, inode->i_sb->s_time_gran); +} + /* * For checkpoint manager */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 53ba384cb675..04a3205d2934 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -632,7 +632,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -708,7 +708,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); } } @@ -1402,7 +1402,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1494,7 +1494,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 06d20489d532..3106155994b4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -573,7 +573,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0f071a70522d..ae29726afff0 100644 --- a/fs/f2fs/namei.c 
+++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); @@ -182,7 +182,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -720,7 +720,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -774,7 +774,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 69c6bb9cf207..3a42405b6515 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -588,7 +588,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && From 20339a1214b21266e44660c91fd9b391206c9d35 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 11:51:23 -0700 Subject: [PATCH 0025/1212] f2fs: keep dirty inodes selectively for checkpoint This is to avoid no free segment bug during checkpoint 
caused by a number of dirty inodes. The case was reported by Chao like this. 1. mount with lazytime option 2. fill 4k file until disk is full 3. sync filesystem 4. read all files in the image 5. umount In this case, we actually don't need to flush dirty inode to inode page during checkpoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 6 +++--- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 29 ++++++++++++++++------------- fs/f2fs/xattr.c | 4 ++-- 10 files changed, 49 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4a34040932e9..a45d1f4b7b0f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -387,7 +387,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3b8ebec0450b..5594667c2f41 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = 
current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1938fe457041..0d2502fdf892 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -541,13 +541,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1680,7 +1680,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1707,7 +1707,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1716,7 +1716,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ 
-1727,7 +1727,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1741,7 +1741,7 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1756,19 +1756,19 @@ static inline bool f2fs_skip_inode_update(struct inode *inode) static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1896,13 +1896,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline int f2fs_readonly(struct super_block *sb) @@ -2054,7 +2054,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode 
*, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 04a3205d2934..ce38a350fb38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -265,7 +265,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -633,7 +633,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -722,7 +722,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* update attributes only */ + f2fs_mark_inode_dirty_sync(inode, false); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1403,7 +1404,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3106155994b4..841aa13d9f4e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -574,7 +574,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d32fd0343eae..bfa512dde4ab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void 
f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ae29726afff0..7f2fdb154180 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -775,7 +775,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -935,7 +935,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -950,7 +950,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 006138a6c5ab..23190a94840b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -620,24 +620,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int 
ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +650,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +679,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3a42405b6515..1c4d5e39586c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -594,7 +594,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if 
(index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: From 86f4d9f42e8a3bb2a02ca1eaecc180f1f1b21ee3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Oct 2016 18:27:56 -0700 Subject: [PATCH 0026/1212] f2fs: make clean inodes when flushing inode page This patch tries to make more clean inodes when flushing dirty inodes in checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/inode.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1dffe86651be..ed79757c36e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -924,7 +924,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bfa512dde4ab..7b5e402f0a72 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -339,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } From 1789a2ca8a3e2247076419891f39f4d076b3738f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Oct 2016 19:09:57 -0700 Subject: [PATCH 0027/1212] f2fs: remove percpu_count due to performance regression This patch removes percpu_count usage due to performance regression in iozone. 
Fixes: 523be8a6b3 ("f2fs: use percpu_counter for page counters") Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 ++++++------ fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/super.c | 16 +++++----------- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6af146c48644..2fdf23398fa1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -313,17 +313,17 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d2502fdf892..932c53f441db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -872,7 +872,7 @@ struct f2fs_sb_info { atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -1286,7 +1286,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void 
inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) return; @@ -1303,7 +1303,7 @@ static inline void inode_inc_dirty_pages(struct inode *inode) static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1319,7 +1319,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } static inline s64 get_dirty_pages(struct inode *inode) @@ -2239,8 +2239,8 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 23190a94840b..6034d51fc5fc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -696,10 +696,6 @@ static void f2fs_destroy_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } @@ -1450,6 +1446,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block 
*raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1474,6 +1471,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1489,13 +1489,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) From 93ae1e63e4757ca32d39dacc8488ade28fbae4d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Nov 2016 14:52:15 +0100 Subject: [PATCH 0028/1212] f2fs: hide a maybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc is unsure about the use of last_ofs_in_node, which might happen without a prior initialization: fs/f2fs//git/arm-soc/fs/f2fs/data.c: In function ‘f2fs_map_blocks’: fs/f2fs/data.c:799:54: warning: ‘last_ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { As pointed out by Chao Yu, the code is actually correct as 'prealloc' is only set if the last_ofs_in_node has been set, the two always get updated together. This initializes last_ofs_in_node to dn.ofs_in_node for each new dnode at the start of the 'next_block' loop, which at that point is a correct initialization as well. I assume that compilers that correctly track the contents of the variables and do not warn about the condition also figure out that they can eliminate the extra assignment here. 
Fixes: 46008c6d4232 ("f2fs: support in batch multi blocks preallocation") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 465fa9d62485..192bc039194d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -707,7 +707,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: From e164e43eb20b2980dc0e7f740c8b3e0596e9778f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2016 10:44:44 -0800 Subject: [PATCH 0029/1212] fs/crypto: catch up 4.9-rc6 Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 26 ++++---- fs/crypto/fname.c | 132 ++++++++++++++++++--------------------- fs/crypto/keyinfo.c | 85 ++++++++++++++++--------- fs/crypto/policy.c | 4 ++ fs/f2fs/dir.c | 6 +- fs/f2fs/namei.c | 6 +- include/linux/fscrypto.h | 24 ------- 7 files changed, 141 insertions(+), 142 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..2d40ab9edc9f 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include #include #include -#include static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct 
crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -152,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -170,19 +172,17 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d49113efa..9b774f4b50c8 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,21 +10,16 @@ * This has not yet undergone a rigorous security audit. 
*/ -#include -#include #include #include #include -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -35,90 +30,80 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { - u32 ciphertext_len; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; + struct scatterlist sg; int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim; + unsigned int lim; + unsigned int cryptlen; lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? 
lim : ciphertext_len; + /* + * Copy the filename to the output buffer for encrypting in-place and + * pad it with the needed number of NUL bytes. + */ + cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); + cryptlen = round_up(cryptlen, padding); + cryptlen = min(cryptlen, lim); + memcpy(oname->name, iname->name, iname->len); + memset(oname->name + iname->len, 0, cryptlen - iname->len); - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } + /* Initialize the IV */ + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - /* Allocate request */ + /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); + "%s: skcipher_request_alloc() failed\n", __func__); return -ENOMEM; } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); + sg_init_one(&sg, oname->name, cryptlen); + skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + /* Do the encryption */ res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { + /* Request is being completed asynchronously; wait for it */ wait_for_completion(&ecr.completion); res = ecr.res; } - kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", 
__func__, res); + return res; + } - oname->len = ciphertext_len; - return res; + oname->len = cryptlen; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -146,7 +131,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -168,7 +153,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -231,9 +216,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); @@ -279,6 +263,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +275,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +290,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +302,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +323,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +357,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263eddc4e..67fb6d8876d0 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
*/ -#include #include -#include #include -#include #include static void derive_crypt_complete(struct crypto_async_request *req, int rc) @@ -139,6 +136,38 @@ static int validate_user_key(struct fscrypt_info *crypt_info, return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; - u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; + int keysize; + u8 *raw_key = NULL; int res; res = fscrypt_initialize(); @@ -179,13 +208,19 @@ int get_crypt_info(struct inode *inode) if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & 
~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +233,20 @@ int get_crypt_info(struct inode *inode) crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + + /* + * This cannot be a stack buffer because it is passed to the scatterlist + * crypto API as part of key derivation. 
+ */ + res = -ENOMEM; + raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + if (!raw_key) + goto out; + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,11 +281,12 @@ int get_crypt_info(struct inode *inode) crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); + raw_key = NULL; if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { put_crypt_info(crypt_info); goto retry; @@ -268,7 +297,7 @@ int get_crypt_info(struct inode *inode) if (res == -ENOKEY) res = 0; put_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); return res; } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115acb5dee..6865663aac69 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5594667c2f41..210082783d5a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,12 +814,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7f2fdb154180..468b2dbe6d34 100644 --- 
a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -451,7 +451,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1047,7 +1047,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1059,7 +1059,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); return *cookie = paddr; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index 76cff18bb032..ff8b11b26f31 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -111,23 +111,6 @@ struct fscrypt_completion_result { struct fscrypt_completion_result ecr = { \ COMPLETION_INITIALIZER((ecr).completion), 0 } -static inline int fscrypt_key_size(int mode) -{ - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - return FS_AES_256_XTS_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_GCM: - return FS_AES_256_GCM_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CBC: - return FS_AES_256_CBC_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CTS: - return FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - #define FS_FNAME_NUM_SCATTER_ENTRIES 4 #define FS_CRYPTO_BLOCK_SIZE 16 #define FS_FNAME_CRYPTO_DIGEST_SIZE 32 @@ -202,13 +185,6 @@ static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); } -static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) -{ - if (size == fscrypt_key_size(mode)) - return size; - return 0; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && 
str->name[0] == '.') From 16c3c372dca7de2199197fcf08a90f6c276011c2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Oct 2016 18:46:34 +0800 Subject: [PATCH 0030/1212] f2fs: report error of f2fs_fill_dentries Report error of f2fs_fill_dentries to ->iterate_shared, otherwise when error occurs, user may just list part of dirents in target directory without any hints. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 21 ++++++++++++--------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 6 ++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 210082783d5a..4436079dbf0c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -785,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -820,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -828,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -872,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if
(f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -892,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 932c53f441db..4b13d70d716c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2014,7 +2014,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 841aa13d9f4e..3f8bfc87c6dc 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -611,6 +611,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -623,11 +624,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? 
err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, From 20cf9476e3b8d1b6ecadf7abf0970d90a393217d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 31 Oct 2016 14:01:41 -0700 Subject: [PATCH 0031/1212] f2fs: avoid infinite loop in the EIO case on recover_orphan_inodes This patch should fix an infinite loop case below. F2FS-fs : inject IO error in f2fs_read_end_io+0xf3/0x120 [f2fs] F2FS-fs (nvme0n1p1): recover_orphan_inode: orphan failed (ino=39ac1a), run fsck to fix. ... [] sync_meta_pages+0xae/0x270 [f2fs] [] ? flush_sit_entries+0x8d/0x960 [f2fs] [] write_checkpoint+0x361/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x320 [f2fs] [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xe0e/0x1300 [f2fs] [] ? sget_userns+0x4f4/0x530 [] mount_bdev+0x182/0x1b0 [] ? f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 [] ? copy_mount_options+0xb7/0x220 [] SyS_mount+0x94/0xd0 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6034d51fc5fc..e007c011ec53 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1890,6 +1890,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). 
+ */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: From 26fcd8659ef3863962235bfa1209b53a86fa9e06 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:44:59 +0900 Subject: [PATCH 0032/1212] f2fs: Add missing break in switch-case Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e007c011ec53..4fd34e7bcf60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -420,6 +420,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_nodiscard: clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; From 6e89bc832cc20e2edd05167dec11ca2a60b6d5d2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:00 +0900 Subject: [PATCH 0033/1212] f2fs: Use generic zoned block device terminology SMR stands for "Shingled Magnetic Recording" which makes sense only for hard disk drives (spinning rust). The ZBC/ZAC standards enable management of SMR disks, but solid state drives may also support those standards. So rename the HMSMR feature to BLKZONED to avoid a HDD centric terminology. For the same reason, rename f2fs_sb_mounted_hmsmr to f2fs_sb_mounted_blkzoned. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 192bc039194d..3817cf841dff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -110,7 +110,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, { if (!is_read_io(rw)) { atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b13d70d716c..5a563a9f3a52 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -2470,9 +2470,9 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4fd34e7bcf60..3574d0620dc4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -974,7 +974,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { From 5b0f4f4c6a017a563edb90f393dd01d0f4dc7d4c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 
Oct 2016 17:45:01 +0900 Subject: [PATCH 0034/1212] f2fs: Check zoned block feature for host-managed zoned block devices The F2FS_FEATURE_BLKZONED feature indicates that the drive was formatted with zone alignment optimization. This is optional for host-aware devices, but mandatory for host-managed zoned block devices. So check that the feature is set in this latter case. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3574d0620dc4..4187e3b9a83e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1639,6 +1639,26 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#else + if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + goto free_sb_buf; + } +#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); From 40e505d0271bc5af896b903ef4e1d6d0068feb27 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:02 +0900 Subject: [PATCH 0035/1212] f2fs: Suppress discard warning message for zoned block devices For zoned block devices, discard is replaced by zone reset. So do not warn if the device does not support discard.
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4187e3b9a83e..3e57ec837de9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,7 +412,7 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); From 789098ffddbfc8adbf55470d66ef6eef264485f0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:03 +0900 Subject: [PATCH 0036/1212] f2fs: Always enable discard for zoned blocks devices Zone write pointer reset acts as discard for zoned block devices. So if the zoned block device feature is enabled, always declare that discard is enabled, even if the device does not actually support the command. For the same reason, prevent the use of the "nodiscard" mount option. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 +++++++------- fs/f2fs/super.c | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5a563a9f3a52..4fd31208965d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1159,13 +1159,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2475,6 +2468,13 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); +} + static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) { clear_opt(sbi, ADAPTIVE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3e57ec837de9..33676c1e35d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -419,6 +419,11 @@ static int parse_options(struct super_block *sb, char *options) } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); break; case Opt_noheap: From 02bccb06333fd6abe7b3bf61f092d6cffc9b3722 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:04 +0900 Subject: [PATCH 0037/1212] f2fs: Do not allow adaptive mode for host-managed zoned block devices The LFS mode is mandatory for host-managed zoned block devices as update in place optimizations are not possible for segments in sequential zones. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 33676c1e35d7..6bc0810969b7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -518,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { From 060887886644d506772e03d323782a6bffaedcb6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:05 +0900 Subject: [PATCH 0038/1212] f2fs: Cache zoned block devices zone type With the zoned block device feature enabled, section discard need to do a zone reset for sections contained in sequential zones, and a regular discard (if supported) for sections stored in conventional zones. Avoid the need for a costly report zones to obtain a section zone type when discarding it by caching the types of the device zones in the super block information. This cache is initialized at mount time for mounts with the zoned block device feature enabled. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 18 +++++++++++++ fs/f2fs/super.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4fd31208965d..c6dba704b0fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -803,6 +803,14 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ + u8 *blkz_type; /* Array of zones type */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -2468,6 +2476,16 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + + return sbi->blkz_type[zno]; +} +#endif + static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6bc0810969b7..d777a18df958 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1512,6 +1512,65 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->log_blocks_per_blkz = 
__ilog2_u32(sbi->blocks_per_blkz); + sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_size(bdev) - 1)) + sbi->nr_blkz++; + + sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); + if (!sbi->blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + sbi->blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1758,6 +1817,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_ino_entry_info(sbi); +#ifdef CONFIG_BLK_DEV_ZONED + err = init_blkz_info(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + goto free_blkz; + } +#endif + /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -1936,6 +2004,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +#ifdef CONFIG_BLK_DEV_ZONED +free_blkz: + kfree(sbi->blkz_type); +#endif kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); From ac0357e5d5e91b10d66cd6b01fe10424bb5215b7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:06 +0900 Subject: [PATCH 0039/1212] f2fs: Reset sequential zones on zoned block devices When a zoned block device is mounted, discarding sections contained in sequential zones must reset the zone write pointer. 
For sections contained in conventional zones, the regular discard is used if the drive supports it. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ec4d74c26067..8e4863bd36f5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -584,6 +585,45 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +#ifdef CONFIG_BLK_DEV_ZONED +static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t sector = SECTOR_FROM_BLOCK(blkstart); + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + struct block_device *bdev = sbi->sb->s_bdev; + + if (nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Unaligned discard attempted (sector %llu + %llu)", + (unsigned long long)sector, + (unsigned long long)nr_sects); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS, 0); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? 
*/ + return -EIO; + } +} +#endif + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { @@ -601,6 +641,11 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb)) + return f2fs_issue_discard_zone(sbi, blkstart, blklen); +#endif return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } From 55fac8071160fb3368531abedb3b72e7a7394004 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:07 +0900 Subject: [PATCH 0040/1212] f2fs: Trace reset zone events Similarly to the regular discard, trace zone reset events. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + include/trace/events/f2fs.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8e4863bd36f5..06b9d16a19f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -615,6 +615,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, GFP_NOFS, 0); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3a09bb4dc3b2..90d6ad49a9c5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1110,6 +1110,27 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +TRACE_EVENT(f2fs_issue_reset_zone, + + TP_PROTO(struct super_block *sb, block_t blkstart), + + TP_ARGS(sb, blkstart), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blkstart = blkstart; + ), + + TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", + show_dev(__entry), + (unsigned long 
long)__entry->blkstart) +); + TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct super_block *sb, unsigned int nobarrier, From d69efabf19970dcc335ea8265affd8eadafe70f3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Nov 2016 20:43:21 +0800 Subject: [PATCH 0041/1212] f2fs: record inode updating status correctly We should record updating status of inode only for living inode, for those unlinked inode it needs to clear its ino cache, otherwise after the ino was been reused, it will cause unneeded node page writing during ->fsync. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7b5e402f0a72..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -377,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -409,10 +412,12 @@ void f2fs_evict_inode(struct inode *inode) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); From 16650422c86074d8f4f02ede10fb66901226da90 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Nov 2016 00:26:55 +0800 Subject: [PATCH 0042/1212] f2fs: fix wrong i_atime 
recovery Shouldn't update in-memory i_atime with on-disk i_mtime of inode when recovering inode. Shuoran found this bug, which had been hidden for a long time; the credit belongs to him. Signed-off-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..d2ba4da08ec3 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,10 +180,10 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); From 4cd4b0465d2227a6ca686d72f81d3fd3b207e94c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:33:57 -0700 Subject: [PATCH 0043/1212] f2fs: assign segments correctly for direct_io Previously, we assigned CURSEG_WARM_DATA for direct_io, but if we have two or four logs, we do not use that type at all. Let's fix it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 06b9d16a19f6..4bdf1191a36f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1422,8 +1422,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg; bool direct_io = (type == CURSEG_DIRECT_IO); - type = direct_io ? 
CURSEG_WARM_DATA : type; - + if (direct_io) { + if (sbi->active_logs <= 4) + type = CURSEG_HOT_DATA; + else + type = CURSEG_WARM_DATA; + } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From 5f59a8f59bdfe9190a24caaec69fa33975df7ce0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:59:15 -0700 Subject: [PATCH 0044/1212] f2fs: remove checkpoint in f2fs_freeze The generic freeze_super() calls sync_filesystems() before f2fs_freeze(). So, basically we don't need to do checkpoint in f2fs_freeze(). But, in xfs/068, it triggers circular locking problem below due to gc_mutex for checkpoint. ====================================================== [ INFO: possible circular locking dependency detected ] 4.9.0-rc1+ #132 Tainted: G OE ------------------------------------------------------- 1. wait for __sb_start_write() by [] dump_stack+0x85/0xc2 [] print_circular_bug+0x1cf/0x230 [] __lock_acquire+0x19e0/0x1bc0 [] lock_acquire+0x11b/0x220 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] __sb_start_write+0x130/0x200 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] f2fs_drop_inode+0x9b/0x160 [f2fs] [] iput+0x171/0x2c0 [] f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] [] block_operations+0x84/0x110 [f2fs] [] write_checkpoint+0xe8/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? sched_clock+0x9/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] ? do_fsync+0x70/0x70 [] ? do_fsync+0x70/0x70 [] sync_fs_one_sb+0x20/0x30 [] iterate_supers+0xae/0x100 [] sys_sync+0x55/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 2. 
wait for sbi->gc_mutex by [] lock_acquire+0x11b/0x220 [] mutex_lock_nested+0x76/0x3f0 [] f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_freeze+0x1c/0x20 [f2fs] [] freeze_super+0xcf/0x190 [] do_vfs_ioctl+0x53c/0x6a0 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d777a18df958..7a0634b0bee8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -800,13 +800,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -2153,3 +2157,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + From 11895b32059553d1ee358980e28587af9cd5eea6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 5 Nov 2016 11:12:40 +0800 Subject: [PATCH 0045/1212] Revert "f2fs: do not recover from previous remained wrong dnodes" i_times of inode will be set with current system time which can be configured through 'date', so it's not safe to judge dnode block as garbage data or unchanged inode depend on i_times. Now, we have used enhanced 'cp_ver + cp' crc method to verify valid dnode block, so I expect recoverying invalid dnode is almost not possible. This reverts commit 807b1e1c8e08452948495b1a9985ab46d329e5c2. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d2ba4da08ec3..62523b217571 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -196,32 +196,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) From 908659afc0564703fae66eae4ffe23b352308ee3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 7 Nov 2016 21:22:31 +0800 Subject: [PATCH 0046/1212] f2fs: return directly if block has been removed from the victim If one block has been to written to a new place, just return in move data process. This patch check it again with holding page lock. 
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 72a0ca08f901..744031194934 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -544,7 +544,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -564,6 +565,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -643,7 +647,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -651,6 +656,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -792,9 +800,9 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); From 17aa419b53395bb52d0afb5863335d9260e1775c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: 
Fri, 11 Nov 2016 12:31:40 -0800 Subject: [PATCH 0047/1212] f2fs: revert segment allocation for direct IO Now we don't need to be too much careful about storage alignment for dio, since its speed becomes quite fast and we'd better avoid any misalignment first. Revert: 38aa0889b250 (f2fs: align direct_io'ed data to section) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +----- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 36 +++++++++--------------------------- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3817cf841dff..c37396b3212e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -582,7 +582,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -600,11 +599,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6dba704b0fe..4d4bfbaeb788 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -639,7 +639,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4bdf1191a36f..19ab2e63d8d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1267,25 +1267,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) 
-{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1419,25 +1415,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - if (direct_io) { - if (sbi->active_logs <= 4) - type = CURSEG_HOT_DATA; - else - type = CURSEG_WARM_DATA; - } - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* From 09d9b573e5881fb588edb22e1a093ac4e485e1f4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:08:22 -0800 Subject: [PATCH 0048/1212] f2fs: allow dio read for LFS mode We can allow dio reads for LFS mode, while doing buffered writes for dio writes. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c37396b3212e..08a1c09adba7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1723,7 +1723,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); From 79d47107adb6f58c8555be43e6b18fccf944ae8d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Oct 2016 19:02:05 -0700 Subject: [PATCH 0049/1212] f2fs: support multiple devices This patch implements multiple devices support for f2fs. Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big volume under one f2fs instance. Internal block management is very simple, but we will modify block allocation and background GC policy to boost IO speed by exploiting them according to each device speed. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c fs/f2fs/segment.c --- fs/f2fs/data.c | 55 ++++++++++++++-- fs/f2fs/f2fs.h | 29 +++++++-- fs/f2fs/segment.c | 112 +++++++++++++++++++++++--------- fs/f2fs/super.c | 138 +++++++++++++++++++++++++++++++--------- include/linux/f2fs_fs.h | 10 ++- 5 files changed, 274 insertions(+), 70 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 08a1c09adba7..447dd624f6a4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -87,6 +87,46 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +/* + * Return true, if pre_bio's bdev is same as its target device. 
+ */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + /* * Low-level block read/write IO operations. */ @@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? 
NULL : sbi; @@ -268,7 +307,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - io->fio.rw != fio->rw)) + (io->fio.rw != fio->rw) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -955,7 +995,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -973,8 +1012,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1068,7 +1106,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * This page will go to BIO. Do we need to send this * BIO off first? 
*/ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; @@ -1725,6 +1764,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; + if (F2FS_I_SB(inode)->s_ndevs) + return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d4bfbaeb788..04f6dddc6d91 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -756,6 +756,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -804,10 +818,8 @@ struct f2fs_sb_info { #endif #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_blkz; /* Total number of zones */ unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ - u8 *blkz_type; /* Array of zones type */ #endif /* for node-related operations */ @@ -924,6 +936,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2190,6 +2204,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void 
f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); @@ -2477,11 +2494,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, - block_t blkaddr) + struct block_device *bdev, block_t blkaddr) { unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; - return sbi->blkz_type[zno]; + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; } #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 19ab2e63d8d7..30d0e9a76c62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -401,6 +401,32 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -411,24 +437,18 @@ static int issue_flush_thread(void *data) return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - ret = 
submit_bio_wait(WRITE_FLUSH, bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,14 +469,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -586,18 +603,24 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } #ifdef CONFIG_BLK_DEV_ZONED -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t sector = SECTOR_FROM_BLOCK(blkstart); sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - struct block_device *bdev = sbi->sb->s_bdev; + sector_t sector; + int devi = 0; - if (nr_sects != bdev_zone_size(bdev)) { + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, - "Unaligned discard attempted (sector %llu + %llu)", - (unsigned long long)sector, - (unsigned long long)nr_sects); + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); return -EIO; } @@ -606,7 +629,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, * use regular discard if the drive supports it. For sequential * zones, reset the zone write pointer. 
*/ - switch (get_blkz_type(sbi, blkstart)) { + switch (get_blkz_type(sbi, bdev, blkstart)) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) @@ -625,29 +648,60 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } #endif -static int f2fs_issue_discard(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t start = SECTOR_FROM_BLOCK(blkstart); sector_t len = SECTOR_FROM_BLOCK(blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb)) - return f2fs_issue_discard_zone(sbi, blkstart, blklen); -#endif - return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + + trace_f2fs_issue_discard(sbi->sb, blkstart, 
blklen); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a0634b0bee8..2d332a16de71 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -1517,9 +1532,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED -static int init_blkz_info(struct f2fs_sb_info *sbi) +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { - struct block_device *bdev = sbi->sb->s_bdev; + struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; sector_t sector = 0; struct blk_zone *zones; @@ -1530,15 +1545,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) if (!f2fs_sb_mounted_blkzoned(sbi->sb)) return 0; + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + return -EINVAL; sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; if (nr_sectors & (bdev_zone_size(bdev) - 1)) - 
sbi->nr_blkz++; + FDEV(devi).nr_blkz++; - sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); - if (!sbi->blkz_type) + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 @@ -1563,7 +1584,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) } for (i = 0; i < nr_zones; i++) { - sbi->blkz_type[n] = zones[i].type; + FDEV(devi).blkz_type[n] = zones[i].type; sector += zones[i].len; n++; } @@ -1667,6 +1688,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); 
+ return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1725,15 +1817,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) "Zoned block device support is not enabled\n"); goto free_sb_buf; } -#else - if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sb)) { - f2fs_msg(sb, KERN_ERR, - "Zoned block device feature not enabled\n"); - goto free_sb_buf; - } #endif - default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1803,6 +1887,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1821,15 +1912,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_ino_entry_info(sbi); -#ifdef CONFIG_BLK_DEV_ZONED - err = init_blkz_info(sbi); - if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS blkzone information"); - goto free_blkz; - } -#endif - /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -2008,10 +2090,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); 
-#ifdef CONFIG_BLK_DEV_ZONED -free_blkz: - kfree(sbi->blkz_type); -#endif +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 422630b8e588..cea41a124a80 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -52,10 +52,17 @@ #define VERSION_LEN 256 #define MAX_VOLUME_NAME 512 +#define MAX_PATH_LEN 64 +#define MAX_DEVICES 8 /* * For superblock */ +struct f2fs_device { + __u8 path[MAX_PATH_LEN]; + __le32 total_segments; +} __packed; + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -94,7 +101,8 @@ struct f2fs_super_block { __le32 feature; /* defined features */ __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ - __u8 reserved[871]; /* valid reserved region */ + struct f2fs_device devs[MAX_DEVICES]; /* device list */ + __u8 reserved[327]; /* valid reserved region */ } __packed; /* From 0bcbcd3714e5765abedf29fc42cd328e16c6b438 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:31:56 -0800 Subject: [PATCH 0050/1212] f2fs: use err for f2fs_preallocate_blocks This patch has no functional change. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 26 +++++++++++++------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 35 +++++++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 447dd624f6a4..13da02435fc5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -652,11 +652,11 @@ static int __allocate_data_block(struct dnode_of_data *dn) return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -668,19 +668,19 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -857,19 +857,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << 
inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04f6dddc6d91..8dc378d82f67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2212,7 +2212,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce38a350fb38..fbfcd809baec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1324,15 +1324,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1344,12 +1344,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1363,7 +1363,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + 
return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -2267,12 +2267,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); From 5f8b73185bd818f074f2351065e46fe46fe18784 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:46:40 -0800 Subject: [PATCH 0051/1212] f2fs: fix redundant block allocation In direct_IO path of f2fs_file_write_iter(), 1. f2fs_preallocate_blocks(F2FS_GET_BLOCK_PRE_DIO) -> allocate LBA X 2. f2fs_direct_IO() -> return 0; Then, f2fs_write_data_page() will allocate another LBA X+1. This makes EIO triggered by HM-SMR. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 13da02435fc5..dcc5f61ac187 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -652,6 +652,13 @@ static int __allocate_data_block(struct dnode_of_data *dn) return 0; } +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -671,7 +678,10 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) err = f2fs_convert_inline_inode(inode); if (err) return err; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { err = f2fs_convert_inline_inode(inode); @@ -1760,11 +1770,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) - return 0; - if (F2FS_I_SB(inode)->s_ndevs) + if (__force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); From 3d89bca8b1feefa6b4a574207115de1458711c7f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 17:38:35 -0800 Subject: [PATCH 0052/1212] f2fs: avoid BG_GC in f2fs_balance_fs If many threads hit has_not_enough_free_secs() in f2fs_balance_fs() at the same time, all the threads would do FG_GC or BG_GC. In this critical path, we totally don't need to do BG_GC at all. Let's avoid that. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 7 +++++-- fs/f2fs/segment.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8dc378d82f67..687ab43a6cd8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2235,7 +2235,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fbfcd809baec..84f4572ae959 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1853,7 +1853,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 744031194934..54d06c21af07 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -905,7 +905,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; @@ -946,6 +946,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30d0e9a76c62..27e1b7c56e4c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -364,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } From 185a1b0664eef1c95a2ab55ffc274146333f921f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 18:20:10 -0800 Subject: [PATCH 0053/1212] f2fs: fix wrong written_valid_blocks counting Previously, written_valid_blocks was got by ckpt->valid_block_count. But if the last checkpoint has some NEW_ADDR due to power-cut, we can get wrong value. Fix it to get the number from actual written block count from sit entries. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 27e1b7c56e4c..58cae4a541a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2176,7 +2176,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2243,7 +2242,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2397,6 +2396,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) 
__set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ From 2ea2e28982f0264aae0d1c3e413eddfa7c8c149e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Nov 2016 10:41:20 +0800 Subject: [PATCH 0054/1212] f2fs: don't wait writeback for datas during checkpoint Normally, while committing a checkpoint, we wait on all pages to be written back no matter whether the page is data or metadata, so in a scenario where lots of data IO is being submitted along with metadata, we may suffer long latency waiting for writeback during checkpoint. Indeed, we only care about persistence for pages with metadata, but not pages with data, as file system consistency is only related to metadata, so in order to avoid encountering long latency in the above scenario, let's recognize and reference metadata in submitted IOs, and wait for writeback only for metadata. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +++++++++++++++++++++++++++++------ fs/f2fs/debug.c | 7 ++++--- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/file.c | 2 -- fs/f2fs/gc.c | 2 -- fs/f2fs/segment.c | 1 - 7 files changed, 40 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ed79757c36e0..889317e07122 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1005,7 +1005,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dcc5f61ac187..3994e0a1d9ff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,26 @@ #include "trace.h" #include +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + +
if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; @@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio) set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -148,7 +171,6 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, struct bio *bio, enum page_type type) { if (!is_read_io(rw)) { - atomic_inc(&sbi->nr_wb_bios); if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); @@ -304,6 +326,11 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || @@ -317,8 +344,6 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io->fio = *fio; } - bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -1330,7 +1355,6 @@ static int f2fs_write_data_page(struct page *page, if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1733,7 +1757,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 2fdf23398fa1..67a04d8074bb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -313,8 +314,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 687ab43a6cd8..d6119ea3b86d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -702,6 +702,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry 
blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -709,6 +710,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -888,7 +891,6 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1302,7 +1304,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2261,7 +2264,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84f4572ae959..bab65f0a5bb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -95,8 +95,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 
54d06c21af07..6390d45c1b68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -690,8 +690,6 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 58cae4a541a7..23e8892c4e60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -288,7 +288,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); From daa738ea01c5e1c24fd67dd043723cf7e80ab758 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 16 Nov 2016 17:26:24 +0800 Subject: [PATCH 0055/1212] f2fs: fix an infinite loop when flush nodes in cp Thread A Thread B - write_checkpoint - block_operations -blk_start_plug -sync_node_pages - f2fs_do_sync_file - fsync_node_pages - f2fs_wait_on_page_writeback Thread A wait for global F2FS_DIRTY_NODES decreased to zero, it start a plug list, some requests have been added to this list. Thread B lock one dirty node page, and wait this page write back. But this page has been in plug list of thread A with PG_writeback flag. Thread A keep on running and its plug list has no chance to finish, so it seems a deadlock between cp and fsync path. This patch add a wait on page write back before set node page dirty to avoid this problem. 
Signed-off-by: Yunlei He Signed-off-by: Pengyang Hou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 389be7f6e07c..59cc29e6b73c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1409,6 +1409,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; From 8351875692b06008a6a91c4e63110c8d70fd83fe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:11 +0800 Subject: [PATCH 0056/1212] f2fs: fix to account total free nid correctly Thread A Thread B Thread C - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid alloc last nid - f2fs_unlock_op - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid as node count still not be increased, we will loop in alloc_nid - f2fs_write_node_pages - f2fs_balance_fs_bg - f2fs_sync_fs - write_checkpoint - block_operations - f2fs_lock_all - f2fs_lock_op While creating new inode, we do not allocate and account nid atomically, so that when there is almost no free nids left, we may encounter deadloop like above stack. In order to avoid that, reuse nm_i::available_nids for accounting free nids and make nid allocation and counting being atomical during node creation. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d6119ea3b86d..973ca74404de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -560,7 +560,7 @@ enum nid_list { struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59cc29e6b73c..edacbabb92cf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1885,11 +1885,13 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; - spin_lock(&nm_i->nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + /* We should not use stale free nids created by build_free_nids */ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); @@ -1900,6 +1902,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1951,6 +1954,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i->state = NID_NEW; __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } + + nm_i->available_nids++; + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2150,6 +2156,19 @@ static void 
remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2222,8 +2241,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2298,7 +2321,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID_LIST] = 0; nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; From d1e1a3a4c8158a03f4d94933de4d17ebaebe9e15 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:31 +0800 Subject: [PATCH 0057/1212] f2fs: fix fdatasync For below two cases, we can't guarantee data consistence: a) 1. xfs_io "pwrite 0 4195328" "fsync" 2. xfs_io "pwrite 4195328 1024" "fdatasync" 3. godown 4. umount & mount --> isize we updated before fdatasync won't be recovered b) 1. xfs_io "pwrite -S 0xcc 0 4202496" "fsync" 2. xfs_io "fpunch 4194304 4096" "fdatasync" 3. godown 4. 
umount & mount --> dnode we punched before fdatasync won't be recovered The reason is that normally fdatasync won't be aware of modification of metadata in file, e.g. isize changing, dnode updating, so in ->fsync we will skip flushing node pages for above cases, result in making fdatasynced file being lost during recovery. Currently we have introduced DIRTY_META global list in sbi for tracking dirty inode selectively, so in fdatasync we can choose to flush nodes depend on dirty state of current inode in the list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/file.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 973ca74404de..16bedd87022d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1763,8 +1763,17 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bab65f0a5bb5..7fd8e7cffe9b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -209,7 +209,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } From aca5463a208f230e27b13aa2d8ca6952afb90f4d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 15:09:48 -0800 
Subject: [PATCH 0058/1212] f2fs: do not recover i_size if it's valid If i_size is already valid during roll_forward recovery, we should not update it according to the block alignment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 62523b217571..687c176f0b56 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -425,7 +425,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (i_size_read(inode) <= (start << PAGE_SHIFT)) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* From 8df5d34aa0cd8ab9ed6d79f1d2d64c9ccbaf6a58 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 18:53:16 -0800 Subject: [PATCH 0059/1212] f2fs: fix wrong AUTO_RECOVER condition If i_size is not aligned to the f2fs's block size, we should not skip inode update during fsync. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16bedd87022d..fa66e5baa58a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1774,7 +1774,8 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + i_size_read(inode) & PAGE_MASK) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); } From 10a2e5e7a2d4a17f15df8b23002979c00d562d4d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 18 Nov 2016 22:21:13 +0800 Subject: [PATCH 0060/1212] f2fs: drop duplicate header timer.h Drop duplicate header timer.h from segment.c. 
Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23e8892c4e60..ba715d60c738 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "segment.h" From 11fce24cbf647730c551293ca061d86716c017dc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Nov 2016 22:27:41 +0800 Subject: [PATCH 0061/1212] f2fs: fix incorrect free inode count in ->statfs While calculating the number of inodes that we can still create in the remaining space, we should consider the space that data/node blocks occupy, since data and node blocks are mixed together in the main area. So fix the wrong calculation in ->statfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2d332a16de71..a288456c17c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -852,7 +852,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From fd464b55a493c54ba660ac3967a4f50f0979da47 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Sun, 20 Nov 2016 19:57:23 +0100 Subject: [PATCH 0062/1212] f2fs: set ->owner for debugfs status file's file_operations The struct file_operations instance serving the f2fs/status debugfs file lacks an initialization of its ->owner. This means that although that file might have been opened, the f2fs module can still get removed. Any further operation on that opened file, releasing included, will cause accesses to unmapped memory.
Indeed, Mike Marshall reported the following: BUG: unable to handle kernel paging request at ffffffffa0307430 IP: [] full_proxy_release+0x24/0x90 <...> Call Trace: [] __fput+0xdf/0x1d0 [] ____fput+0xe/0x10 [] task_work_run+0x8e/0xc0 [] do_exit+0x2ae/0xae0 [] ? __audit_syscall_entry+0xae/0x100 [] ? syscall_trace_enter+0x1ca/0x310 [] do_group_exit+0x44/0xc0 [] SyS_exit_group+0x14/0x20 [] do_syscall_64+0x61/0x150 [] entry_SYSCALL64_slow_path+0x25/0x25 <...> ---[ end trace f22ae883fa3ea6b8 ]--- Fixing recursive fault but reboot is needed! Fix this by initializing the f2fs/status file_operations' ->owner with THIS_MODULE. This will allow debugfs to grab a reference to the f2fs module upon any open on that file, thus preventing it from getting removed. Fixes: 902829aa0b72 ("f2fs: move proc files to debugfs") Reported-by: Mike Marshall Reported-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Nicolai Stange Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 67a04d8074bb..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -377,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, From b683e01e25446c1fbf67d1a895d5ee25d897c694 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Nov 2016 15:20:16 +0100 Subject: [PATCH 0063/1212] f2fs: fix 32-bit build The addition of multiple-device support broke CONFIG_BLK_DEV_ZONED on 32-bit machines because of a 64-bit division: fs/f2fs/f2fs.o: In function `__issue_discard_async': extent_cache.c:(.text.__issue_discard_async+0xd4): undefined reference to `__aeabi_uldivmod' Fortunately, bdev_zone_size() is guaranteed to return a power-of-two number, so we can replace the % operator with a cheaper bit mask. 
Fixes: 792b84b74b54 ("f2fs: support multiple devices") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba715d60c738..4f557e5e789d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -614,7 +614,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path: "", From 099d3df452efc107e3856a606d9fa29302edc13e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 28 Nov 2016 15:33:38 -0800 Subject: [PATCH 0064/1212] f2fs: do not activate auto_recovery for fallocated i_size If a file needs to keep its i_size by fallocate, we need to turn off auto recovery during roll-forward recovery. This will resolve the below scenario. 1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 4096" -c "fsync" 2. xfs_io -f /mnt/f2fs/file -c "falloc -k 4096 4096" -c "fsync" 3. md5sum /mnt/f2fs/file; 4. godown /mnt/f2fs/ 5. umount /mnt/f2fs/ 6. mount -t f2fs /dev/sdx /mnt/f2fs 7. 
md5sum /mnt/f2fs/file Reported-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 38 +++++++++++++++++++++----------------- fs/f2fs/file.c | 2 ++ fs/f2fs/recovery.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa66e5baa58a..4f1046be0d74 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -449,6 +449,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -461,6 +462,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -1763,23 +1766,6 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) -{ - if (dsync) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; - - spin_lock(&sbi->inode_lock[DIRTY_META]); - ret = list_empty(&F2FS_I(inode)->gdirty_list); - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return ret; - } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || - i_size_read(inode) & PAGE_MASK) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; @@ -1932,6 +1918,24 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool 
f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fd8e7cffe9b..57b6dbcbbd88 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1403,6 +1403,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 687c176f0b56..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -187,6 +187,8 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = ""; else @@ -425,7 +427,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } - if (i_size_read(inode) <= (start << PAGE_SHIFT)) + if (!file_keep_isize(inode) && + (i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -478,8 +481,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = 
%lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); return err; } From 0c0f597086be6f2f335648529df3e725840d582a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Nov 2016 19:13:43 -0800 Subject: [PATCH 0065/1212] f2fs: return AOP_WRITEPAGE_ACTIVATE for writepage We should use AOP_WRITEPAGE_ACTIVATE when we bypass writing pages. Signed-off-by: Chao Yu Signed-off-by: Miao Xie Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3994e0a1d9ff..3da99574a59d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1375,6 +1375,8 @@ static int f2fs_write_data_page(struct page *page, redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1470,6 +1472,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping, ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; From 769b4ad829b5b4a3ee2924441106cdf349d59f02 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2016 15:11:32 -0800 Subject: [PATCH 0066/1212] Revert "f2fs: use percpu_counter for # of dirty pages in inode" This reverts commit 1beba1b3a953107c3ff5448ab4e4297db4619c76. The perpcu_counter doesn't provide atomicity in single core and consume more DRAM. That incurs fs_mark test failure due to ENOMEM. 
Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4f1046be0d74..c7eb2ff398ce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -479,7 +479,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -1316,7 +1316,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1332,7 +1332,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1342,9 +1342,9 @@ static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 57b6dbcbbd88..5c0500813efe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1537,7 +1537,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a288456c17c0..ce09191891f8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -571,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -703,7 +699,6 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } From a980f29780f397a57a0f53cc2ce7a85078cf7e5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 11:37:14 -0800 Subject: [PATCH 0067/1212] f2fs: call sync_fs when f2fs is idle The sync_fs in f2fs_balance_fs_bg must avoid interrupting 
current user requests. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4f557e5e789d..b95f07559d90 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -381,12 +381,15 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false); + if (!is_idle(sbi)) + return; + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; From 96e6c6084b7525043095ed4235b60482d8f36573 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 13:56:04 -0800 Subject: [PATCH 0068/1212] f2fs: detect wrong layout Previous mkfs.f2fs allows small partition inappropriately, so f2fs should detect that as well. Refer this in f2fs-tools. 
mkfs.f2fs: detect small partition by overprovision ratio and # of segments Reported-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89ab4301ef02..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce09191891f8..07f4ba444733 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1453,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1464,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 640bdae24f2744feb52dfdeadda32215c111709c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 17:25:32 -0800 Subject: [PATCH 0069/1212] f2fs: free meta pages if sanity check 
for ckpt is failed This fixes missing freeing meta pages in the error case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 889317e07122..640f28576e88 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -768,7 +768,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; if (cur_page == cp1) sbi->cur_cp_pack = 1; @@ -796,6 +796,9 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; From f96ce4c98613274ebbdc8cf527a8eb43e47ba4ba Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Dec 2016 16:23:32 -0800 Subject: [PATCH 0070/1212] f2fs: fix to access nullified flush_cmd_control pointer f2fs_sync_file() remount_ro - f2fs_readonly - destroy_flush_cmd_control - f2fs_issue_flush - no fcc pointer! So, this patch doesn't free fcc in this case, but just stop its kernel thread which sends flush commands. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 33 +++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7eb2ff398ce..3ef2d93ab936 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2150,7 +2150,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b95f07559d90..a288de069164 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -486,8 +486,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -498,6 +503,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -505,6 +515,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if 
(IS_ERR(fcc->f2fs_issue_flush)) { @@ -517,14 +528,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -2658,7 +2675,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07f4ba444733..e6d8d011786c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1103,8 +1103,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. 
*/ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; From 2cf125d417ade924903ef4c09b24a864513714c1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 9 Dec 2016 11:46:10 -0800 Subject: [PATCH 0071/1212] scripts/tags.sh: catch 4.9-rc6 Signed-off-by: Jaegeuk Kim --- scripts/tags.sh | 222 ++++++++++++++++++++++++------------------------ 1 file changed, 112 insertions(+), 110 deletions(-) diff --git a/scripts/tags.sh b/scripts/tags.sh index 262889046703..a2ff3388e5ea 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # Generate tags or cscope files # Usage tags.sh # @@ -134,11 +134,6 @@ all_kconfigs() find_other_sources 'Kconfig*' } -all_defconfigs() -{ - find_sources $ALLSOURCE_ARCHS "defconfig" -} - docscope() { (echo \-k; echo \-q; all_target_sources) > cscope.files @@ -150,8 +145,111 @@ dogtags() all_target_sources | gtags -i -f - } +# Basic regular expressions with an optional /kind-spec/ for ctags and +# the following limitations: +# - No regex modifiers +# - Use \{0,1\} instead of \?, because etags expects an unescaped ? 
+# - \s is not working with etags, use a space or [ \t] +# - \w works, but does not match underscores in etags +# - etags regular expressions have to match at the start of a line; +# a ^[^#] is prepended by setup_regex unless an anchor is already present +regex_asm=( + '/^\(ENTRY\|_GLOBAL\)(\([[:alnum:]_\\]*\)).*/\2/' +) +regex_c=( + '/^SYSCALL_DEFINE[0-9](\([[:alnum:]_]*\).*/sys_\1/' + '/^COMPAT_SYSCALL_DEFINE[0-9](\([[:alnum:]_]*\).*/compat_sys_\1/' + '/^TRACE_EVENT(\([[:alnum:]_]*\).*/trace_\1/' + '/^TRACE_EVENT(\([[:alnum:]_]*\).*/trace_\1_rcuidle/' + '/^DEFINE_EVENT([^,)]*, *\([[:alnum:]_]*\).*/trace_\1/' + '/^DEFINE_EVENT([^,)]*, *\([[:alnum:]_]*\).*/trace_\1_rcuidle/' + '/^DEFINE_INSN_CACHE_OPS(\([[:alnum:]_]*\).*/get_\1_slot/' + '/^DEFINE_INSN_CACHE_OPS(\([[:alnum:]_]*\).*/free_\1_slot/' + '/^PAGEFLAG(\([[:alnum:]_]*\).*/Page\1/' + '/^PAGEFLAG(\([[:alnum:]_]*\).*/SetPage\1/' + '/^PAGEFLAG(\([[:alnum:]_]*\).*/ClearPage\1/' + '/^TESTSETFLAG(\([[:alnum:]_]*\).*/TestSetPage\1/' + '/^TESTPAGEFLAG(\([[:alnum:]_]*\).*/Page\1/' + '/^SETPAGEFLAG(\([[:alnum:]_]*\).*/SetPage\1/' + '/\<__SETPAGEFLAG(\([[:alnum:]_]*\).*/__SetPage\1/' + '/\ Date: Mon, 26 Sep 2016 18:07:48 +0200 Subject: [PATCH 0072/1212] fs/super.c: fix race between freeze_super() and thaw_super() Change thaw_super() to check frozen != SB_FREEZE_COMPLETE rather than frozen == SB_UNFROZEN, otherwise it can race with freeze_super() which drops sb->s_umount after SB_FREEZE_WRITE to preserve the lock ordering. In this case thaw_super() will wrongly call s_op->unfreeze_fs() before it was actually frozen, and call sb_freeze_unlock() which leads to the unbalanced percpu_up_write(). Unfortunately lockdep can't detect this, so this triggers misc BUG_ON()'s in kernel/rcu/sync.c. 
Reported-and-tested-by: Nikolay Borisov Signed-off-by: Oleg Nesterov Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/super.c b/fs/super.c index f5f4b328f860..d4d2591b77c8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1326,8 +1326,8 @@ int freeze_super(struct super_block *sb) } } /* - * This is just for debugging purposes so that fs can warn if it - * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + * For debugging purposes so that fs can warn if it sees write activity + * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); @@ -1346,7 +1346,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_writers.frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; } From a1c31d8ded433de32cf7d931ada7ec7cebe8ba85 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 11 Dec 2016 15:35:15 +0800 Subject: [PATCH 0073/1212] f2fs: fix a missing size change in f2fs_setattr This patch fix a missing size change in f2fs_setattr Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c0500813efe..5808d5c709a7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -678,6 +678,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = inode_change_ok(inode, attr); if (err) @@ -708,6 +709,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -720,8 +723,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - /* update attributes only 
*/ - f2fs_mark_inode_dirty_sync(inode, false); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From d8a1d0c13cde35521bd92c1c4607cbd9d3a7618f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 09:55:38 -0800 Subject: [PATCH 0074/1212] f2fs: remove wrong backported codes Kconfig and dentry RCU mode stuffs. Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 1 - fs/f2fs/namei.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1852d99df97b..378c221d68a9 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,7 +2,6 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK select CRYPTO - select KEYS select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 468b2dbe6d34..523bf073642e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1014,9 +1014,6 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook u32 max_size = inode->i_sb->s_blocksize; int res; - if (!dentry) - return ERR_PTR(-ECHILD); - res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); From 7146292938e42afacadd7b3402f459e638f5b77a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 18:24:54 -0800 Subject: [PATCH 0075/1212] f2fs: resolve op and op_flags confilcts Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 +++++--- fs/f2fs/data.c | 54 ++++++++++++++++++++----------------- fs/f2fs/f2fs.h | 24 +++++++++++++++-- fs/f2fs/gc.c | 12 ++++++--- fs/f2fs/inline.c | 3 ++- fs/f2fs/node.c | 12 +++++---- fs/f2fs/segment.c | 9 ++++--- fs/f2fs/trace.c | 7 ++--- include/trace/events/f2fs.h | 19 ++++++++----- 9 files changed, 98 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 640f28576e88..2ed785e5ffbb 
100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -64,14 +64,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -158,13 +159,15 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op = REQ_OP_READ, + .op_flags = sync ? (REQ_SYNC | REQ_META | REQ_PRIO) : + REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; if (unlikely(type == META_POR)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3da99574a59d..87a85ff3c069 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -167,15 +167,15 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } -static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, - struct bio *bio, enum page_type type) +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) { - if (!is_read_io(rw)) { + if (!is_read_io(bio_op(bio))) { if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } - submit_bio(rw, bio); + submit_bio(0, bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -185,12 +185,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->rw)) + if (is_read_io(fio->op)) trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); else 
trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - __submit_bio(io->sbi, fio->rw, io->bio, fio->type); + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } @@ -256,10 +258,10 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - if (test_opt(sbi, NOBARRIER)) - io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + io->fio.op = REQ_OP_WRITE; + io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -301,14 +303,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, fio->rw, bio, fio->type); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -317,7 +320,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->rw); + bool is_read = is_read_io(fio->op); struct page *bio_page; io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; @@ -334,7 +337,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.rw != fio->rw) || + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: @@ -462,7 +465,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *get_read_data_page(struct inode *inode, pgoff_t index, - int rw, bool for_write) + int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -472,7 +475,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .encrypted_page = NULL, }; @@ -540,7 +544,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, REQ_SYNC, false); if (IS_ERR(page)) return page; @@ -566,7 +570,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, REQ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -1144,7 +1148,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (bio && (last_block_in_bio != block_nr - 1 || !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -1153,6 +1157,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, bio = NULL; goto set_error_page; } + bio_set_op_attrs(bio, REQ_OP_READ, 0); } 
if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1167,7 +1172,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, goto next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1177,7 +1182,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1295,7 +1300,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1717,14 +1723,14 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, err = PTR_ERR(bio); goto fail; } - + bio->bi_rw = READ_SYNC; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; goto fail; } - __submit_bio(sbi, READ_SYNC, bio, DATA); + __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ef2d93ab936..d0c7decdd3ac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -112,6 +113,24 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return 
REQ_SYNC; + return 0; +} + /** * wq_has_sleeper - check if there are any waiting processes * @wq: wait queue head @@ -746,14 +765,15 @@ enum page_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + int op; /* contains REQ_OP_ */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; -#define is_read_io(rw) (((rw) & 1) == READ) +#define is_read_io(rw) (rw == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6390d45c1b68..d3a36e4b442c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -627,7 +628,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - fio.rw = WRITE_SYNC; + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -668,7 +670,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE, .page = page, .encrypted_page = NULL, }; @@ -767,7 +770,8 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - 
start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3f8bfc87c6dc..d82e97b1e6c4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -110,7 +110,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index edacbabb92cf..26a745c544fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1068,14 +1068,15 @@ struct page *new_node_page(struct dnode_of_data *dn, * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int rw) +static int read_node_page(struct page *page, int op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .page = page, .encrypted_page = NULL, }; @@ -1116,7 +1117,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } @@ -1134,7 +1135,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, REQ_SYNC); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1575,7 +1576,8 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a288de069164..70aec4a8de13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -407,6 +408,7 @@ static int __submit_flush_wait(struct block_device *bdev) struct bio *bio = f2fs_bio_alloc(0); int ret; + bio->bi_rw = REQ_OP_WRITE; bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); @@ -1544,7 +1546,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -1552,7 +1555,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_mbio(&fio); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 562ce0821559..73b4e1d1912a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -25,11 +25,11 @@ static inline void __print_last_io(void) if (!last_io.len) return; - trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, + last_io.fio.op, last_io.fio.op_flags, last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); @@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) if (last_io.major == major 
&& last_io.minor == minor && last_io.pid == pid && last_io.type == __file_type(inode, pid) && - last_io.fio.rw == fio->rw && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && last_io.fio.new_blkaddr + last_io.len == fio->new_blkaddr) { last_io.len++; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 90d6ad49a9c5..7ad46e8a89e6 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); #define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) -#define show_bio_type(type) show_bio_base(type), show_bio_extra(type) +#define show_bio_type(op, op_flags) \ + show_bio_base((op|op_flags)), show_bio_extra((op|op_flags)) #define show_bio_base(type) \ __print_symbolic(F2FS_BIO_MASK(type), \ @@ -734,7 +735,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(pgoff_t, index) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) ), @@ -744,7 +746,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->index = page->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; ), @@ -754,7 +757,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type)) ); @@ -785,7 +788,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) __field(sector_t, sector) __field(unsigned int, size) @@ -793,7 +797,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = 
sb->s_dev; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; @@ -801,7 +806,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", show_dev(__entry), - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) From 373bb0247ae5d5ff0e371d613599fa44392e972e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 16:41:25 -0800 Subject: [PATCH 0076/1212] f2fs: support async discard based on v4.9 This patch is based on commit 275b66b09e85 (f2fs: support async discard). Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 183 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 181 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..d485bea3d6bb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,6 +1255,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1273,10 +1274,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) + if (err) { release_discard_addrs(sbi); - else + } else { clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0c7decdd3ac..883d3ab388c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -127,7 +127,7 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, static 
inline int wbc_to_write_flags(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) - return REQ_SYNC; + return REQ_SYNC | REQ_NOIDLE; return 0; } @@ -2174,6 +2174,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 70aec4a8de13..13bea6e5120e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,6 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *bio_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -622,6 +623,162 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, + struct bio *bio) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + + INIT_LIST_HEAD(&be->list); + be->bio = bio; + init_completion(&be->event); + list_add_tail(&be->list, wait_list); + + return be; +} + +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be, *tmp; + + list_for_each_entry_safe(be, tmp, wait_list, list) { + struct bio *bio = be->bio; + int err; + + wait_for_completion_io(&be->event); + err = be->error; + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + + bio_put(bio); + 
list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); + } +} + +static void f2fs_submit_bio_wait_endio(struct bio *bio) +{ + struct bio_entry *be = (struct bio_entry *)bio->bi_private; + + be->error = bio->bi_error; + complete(&be->event); +} + +/* copied from block/blk-lib.c in 4.10-rc1 */ +static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + unsigned int granularity; + int op = REQ_WRITE | REQ_DISCARD; + int alignment; + sector_t bs_mask; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secdiscard(q)) + return -EOPNOTSUPP; + op |= REQ_SECURE; + } + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + while (nr_sects) { + unsigned int req_sects; + sector_t end_sect, tmp; + + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); + + /** + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. 
+ */ + end_sect = sector + req_sects; + tmp = end_sect; + if (req_sects < nr_sects && + sector_div(tmp, granularity) != alignment) { + end_sect = end_sect - alignment; + sector_div(end_sect, granularity); + end_sect = end_sect * granularity + alignment; + req_sects = end_sect - sector; + } + + if (bio) { + int ret = submit_bio_wait(0, bio); + bio_put(bio); + if (ret) + return ret; + } + bio = f2fs_bio_alloc(0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, op, 0); + + bio->bi_iter.bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); + } + + *biop = bio; + return 0; +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + struct bio *bio = NULL; + int err; + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); + if (!err && bio) { + struct bio_entry *be = __add_bio_entry(sbi, bio); + + bio->bi_private = be; + bio->bi_end_io = f2fs_submit_bio_wait_endio; + submit_bio(REQ_SYNC, bio); + } + + return err; +} + #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -655,8 +812,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return blkdev_issue_discard(bdev, sector, nr_sects, - GFP_NOFS, 0); + return 
__f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: trace_f2fs_issue_reset_zone(sbi->sb, blkstart); @@ -672,15 +828,12 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); - #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_mounted_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -720,8 +873,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, if (len) err = __issue_discard_async(sbi, bdev, start, len); - - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return err; } @@ -822,11 +973,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); + blk_start_plug(&plug); + mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -875,6 +1029,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } + + blk_finish_plug(&plug); } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -2551,6 +2707,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; 
INIT_LIST_HEAD(&sm_info->discard_list); + INIT_LIST_HEAD(&sm_info->wait_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2694,10 +2851,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; + bio_entry_slab = f2fs_kmem_cache_create("bio_entry", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + goto destroy_discard_entry; + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_discard_entry; + goto destroy_bio_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2707,6 +2869,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); +destroy_bio_entry: + kmem_cache_destroy(bio_entry_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2716,6 +2880,7 @@ int __init create_segment_manager_caches(void) void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(bio_entry_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From 0e4e431a23c324ed49916871ef1306a8c08f08c1 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 13 Dec 2016 17:23:37 +0800 Subject: [PATCH 0077/1212] f2fs: remove unused values in recover_fsync_data This patch remove unused values in function recover_fsync_data Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 981a9584b62f..4fb4471a3206 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -552,10 +552,8 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; struct list_head dir_list; - block_t 
blkaddr; int err; int ret = 0; bool need_writecp = false; @@ -571,8 +569,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); - blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err || list_empty(&inode_list)) From 8799db31b9b1969792f05a48454234febed10008 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Dec 2016 18:54:59 +0800 Subject: [PATCH 0078/1212] f2fs: don't cache nat entry if out of memory If we run out of memory, in cache_nat_entry, it's better to avoid loop for allocating memory to cache nat entry, so in low memory scenario, for read path of node block, I expect this can avoid unneeded latency. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 26a745c544fc..b01b01cfc39e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -245,12 +245,24 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + bool no_fail) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + if (no_fail) { + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + } else { + new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + if (!new) + return NULL; + if (radix_tree_insert(&nm_i->nat_root, nid, new)) { + kmem_cache_free(nat_entry_slab, new); + return NULL; + } + } + memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -267,8 +279,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = 
__lookup_nat_cache(nm_i, nid); if (!e) { - e = grab_nat_entry(nm_i, nid); - node_info_from_raw_nat(&e->ni, ne); + e = grab_nat_entry(nm_i, nid, false); + if (e) + node_info_from_raw_nat(&e->ni, ne); } else { f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -286,7 +299,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); + e = grab_nat_entry(nm_i, ni->nid, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -2155,7 +2168,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid); + ne = grab_nat_entry(nm_i, nid, true); node_info_from_raw_nat(&ne->ni, &raw_ne); } From e891bf97aa8ab43ff254acede061157c1dae0ba8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 16 Dec 2016 11:18:15 +0300 Subject: [PATCH 0079/1212] f2fs: remove unneeded condition We checked that "inode" is not an error pointer earlier so there is no need to check again here. Signed-off-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 523bf073642e..ca9e2f85eae8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -321,9 +321,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto err_out; } - if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && - (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !fscrypt_has_permitted_context(dir, inode)) { + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { bool nokey = f2fs_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); err = nokey ? 
-ENOKEY : -EPERM; From e82207d3ee89d81b19882106b69cd5b760f3d4aa Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 19 Dec 2016 20:10:48 +0800 Subject: [PATCH 0080/1212] f2fs: fix a problem of using memory after free This patch fixes a use-after-free problem in function __try_merge_extent_node. Fixes: 0f825ee6e873 ("f2fs: add new interfaces for extent tree") Cc: Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 4db44da7ef69..e02c3d88dc9a 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -352,11 +352,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } From 4212c0f71a584ad1a8f4655c19032816f65b5d3e Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 20 Dec 2016 11:11:35 +0800 Subject: [PATCH 0081/1212] f2fs: add a case of no need to read a page in write begin If the range we write covers all of the valid data in the last page, we do not need to read it.
Signed-off-by: Yunlei He [Jaegeuk Kim: nullify the remaining area (fix: xfstests/f2fs/001)] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87a85ff3c069..b47830db4263 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1712,6 +1712,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (len == PAGE_SIZE || PageUptodate(page)) return 0; + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); @@ -1765,7 +1770,7 @@ static int f2fs_write_end(struct file *file, * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { - if (unlikely(copied != PAGE_SIZE)) + if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); From 670a455ef9e10064eb35579e2b6c9c7492ab693f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 20 Dec 2016 21:57:42 +0800 Subject: [PATCH 0082/1212] f2fs: use rb_entry_safe Use rb_entry_safe() instead of open-coding it. Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e02c3d88dc9a..6ed6424807b6 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -311,28 +311,24 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, tmp_node = parent; if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_ex = tmp_node ? 
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&en->rb_node); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&en->rb_node); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } return en; } @@ -493,9 +489,8 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!next_en) { struct rb_node *node = rb_next(&en->rb_node); - next_en = node ? - rb_entry(node, struct extent_node, rb_node) - : NULL; + next_en = rb_entry_safe(node, struct extent_node, + rb_node); } if (parts) From 0c61b0a37be7e2f7f55066dfa77c3697765724ff Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 22 Dec 2016 11:46:24 +0800 Subject: [PATCH 0083/1212] f2fs: fix a missing discard prefree segments If userspace issue a fstrim with a range not involve prefree segments, it will reuse these segments without discard. This patch fix it. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 13bea6e5120e..f4e41f997ae3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,9 +996,13 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (force || !test_opt(sbi, DISCARD)) + if (!test_opt(sbi, DISCARD)) continue; + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); @@ -2343,8 +2347,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason == CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc); + + cpc->trim_start = trim_start; } mutex_unlock(&sit_i->sentry_lock); From c1e5d5278024fbe65f123c6f8e772b82f1f72106 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 11:51:32 -0800 Subject: [PATCH 0084/1212] f2fs: reassign new segment for mode=lfs Otherwise we can remain wrong curseg->next_blkoff, resulting in fsck failure. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f4e41f997ae3..e6d3f3d4b028 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1508,9 +1508,6 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; - if (test_opt(sbi, LFS)) - return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; From 849981c99bd8508cfcce20b693ff3206e0b3d161 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 12:13:03 -0800 Subject: [PATCH 0085/1212] f2fs: add submit_bio tracepoint This patch adds final submit_bio() tracepoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 14 +++++++----- include/trace/events/f2fs.h | 45 ++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b47830db4263..ab82b388c5aa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -175,6 +175,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); submit_bio(0, bio); } @@ -185,13 +189,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) - trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); - else - trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h 
index 7ad46e8a89e6..217691582dd4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -779,12 +779,11 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, TP_CONDITION(page->mapping) ); -DECLARE_EVENT_CLASS(f2fs__submit_bio, +DECLARE_EVENT_CLASS(f2fs__bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) @@ -797,9 +796,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->op = fio->op; - __entry->op_flags = fio->op_flags; - __entry->type = fio->type; + __entry->op = bio_op(bio); + __entry->op_flags = bio->bi_rw; + __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), @@ -812,22 +811,38 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, __entry->size) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), 
TP_CONDITION(bio) ); From 22f1947949fd050ca103d37f369dbd6d2024ea50 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Dec 2016 10:12:56 -0800 Subject: [PATCH 0086/1212] f2fs: support IO alignment for DATA and NODE writes This patch implements IO alignment by filling dummy blocks in DATA and NODE write bios. If we can guarantee, for example, 32KB or 64KB for such the IOs, we can eliminate underlying dummy page problem which FTL conducts in order to close MLC or TLC partial written pages. Note that, - it requires "-o mode=lfs". - IO size should be power of 2, not exceed BIO_MAX_PAGES, 256. - read IO is still 4KB. - do checkpoint at fsync, if dummy NODE page was written. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 +++++++++++++++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 4 ++- fs/f2fs/segment.c | 9 +++++-- fs/f2fs/segment.h | 3 +++ fs/f2fs/super.c | 13 +++++++++- include/linux/f2fs_fs.h | 6 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ab82b388c5aa..1b19b805ef81 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -93,6 +93,17 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_error)) + f2fs_stop_checkpoint(sbi, true); + continue; + } + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { @@ -171,10 +182,42 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { + unsigned int start; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); + + if (type != DATA && type != NODE) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= 
F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } +submit_io: if (is_read_io(bio_op(bio))) trace_f2fs_submit_read_bio(sbi->sb, type, bio); else @@ -319,13 +362,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -void f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_mbio(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->op); struct page *bio_page; + int err = 0; io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; @@ -346,6 +390,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + err = -EAGAIN; + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + goto out_fail; + } io->bio = __bio_alloc(sbi, fio->new_blkaddr, BIO_MAX_PAGES, is_read); io->fio = *fio; @@ -359,9 +409,10 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); - +out_fail: up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); + return err; } static void __set_data_blkaddr(struct dnode_of_data *dn) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 883d3ab388c1..f9a739ffca0f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -859,6 +859,8 @@ struct f2fs_sb_info { struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + int write_io_size_bits; /* Write IO size bits */ + mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -2241,7 +2243,7 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, struct page *, nid_t, enum page_type, int); void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_io_info *); +int f2fs_submit_page_mbio(struct f2fs_io_info *); struct block_device *f2fs_target_device(struct f2fs_sb_info *, block_t, struct bio *); int f2fs_target_device_index(struct f2fs_sb_info *, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e6d3f3d4b028..a7bb97826445 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1684,15 +1684,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static 
void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + int err; if (fio->type == NODE || fio->type == DATA) mutex_lock(&fio->sbi->wio_mutex[fio->type]); - +reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_mbio(fio); + if (err == -EAGAIN) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } if (fio->type == NODE || fio->type == DATA) mutex_unlock(&fio->sbi->wio_mutex[fio->type]); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9d44ce83acb2..08f1455c812c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -186,9 +186,12 @@ struct segment_allocation { * the page is atomically written, and it is in inmem_pages list. */ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) struct inmem_pages { struct list_head list; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6d8d011786c..fb9f6c09fa11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1764,6 +1764,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -1881,12 +1883,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_options; + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); + if (!sbi->write_io_dummy) + goto free_options; + } + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, 
"Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_options; + goto free_io_dummy; } err = get_valid_checkpoint(sbi); @@ -2104,6 +2113,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); free_options: destroy_percpu_info(sbi); kfree(options); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index cea41a124a80..f0748524ca8c 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,12 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */ +#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) + /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) #define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM) From 168fef245e107466a47431627a66414460822faa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 17:09:19 -0800 Subject: [PATCH 0087/1212] f2fs: get io size bit from mount option This patch adds to set io_size_bits from mount option. 
Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ fs/f2fs/super.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 753dd4f96afe..d99faced79cb 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,8 @@ data_flush Enable data flushing before checkpoint in order to mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb9f6c09fa11..3b169927408e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -101,6 +101,7 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_mode, + Opt_io_size_bits, Opt_fault_injection, Opt_lazytime, Opt_nolazytime, @@ -133,6 +134,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, @@ -535,6 +537,17 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + sbi->write_io_size_bits = arg; + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; @@ -558,6 +571,13 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + 
"Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } return 0; } @@ -918,6 +938,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); return 0; } From 75e402e690b9f1713458f3adbef306a73de74f3c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 13:55:09 -0800 Subject: [PATCH 0088/1212] f2fs: show the max number of atomic operations This patch adds to show the max number of atomic operations which are conducting concurrently. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 7 +++++++ fs/f2fs/f2fs.h | 17 +++++++++++++++++ fs/f2fs/file.c | 8 ++++++-- fs/f2fs/segment.c | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fbd5184140d0..29cdf0c1da1d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,6 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -256,6 +258,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); + seq_printf(s, " - Atomic write count: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -414,6 +418,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f9a739ffca0f..19e054b1c4f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -951,6 +951,8 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -2303,6 +2305,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2374,6 +2377,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) 
#define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2427,6 +2441,9 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) #define stat_dec_inline_dir(inode) +#define stat_inc_atomic_write(inode) +#define stat_dec_atomic_write(inode) +#define stat_update_max_atomic_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5808d5c709a7..d7eacef08797 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1546,6 +1546,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) clear_inode_flag(inode, FI_ATOMIC_FILE); out: + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1575,9 +1577,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + stat_dec_atomic_write(inode); + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7bb97826445..353ec85b3835 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -243,6 +243,7 @@ void drop_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); From 3c299af84525fc49d1ea46bf4f420df132a31d3f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 17:31:15 -0800 Subject: [PATCH 0089/1212] f2fs: don't allow encrypted operations without keys This patch fixes the renaming bug on encrypted filenames, which was pointed 
out by (ext4: don't allow encrypted operations without keys) Cc: Theodore Ts'o Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ca9e2f85eae8..db3079cd665d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -660,6 +660,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -840,6 +846,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || From 7e43f19b5ecdff3ff4ac97149446ad7c035b2185 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 3 Jan 2017 17:19:30 -0800 Subject: [PATCH 0090/1212] f2fs: drop exist_data for inline_data when truncated to 0 A test program gets two different SEEK_DATA values for a newly created file and an existing file on the f2fs filesystem. 
F2FS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 8192) SEEK_DATA size != 0 (offset = 4096) PNFS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 4096) SEEK_DATA size != 0 (offset = 4096) int main(int argc, char **argv) { char *filename = argv[1]; int offset = 1, i = 0, fd = -1; if (argc < 2) { printf("Usage: %s f2fsfilename\n", argv[0]); return -1; } /* if (!access(filename, F_OK) || errno != ENOENT) { printf("Needs a new file for test, %m\n"); return -1; }*/ fd = open(filename, O_RDWR | O_CREAT, 0777); if (fd < 0) { printf("Create test file %s failed, %m\n", filename); return -1; } for (i = 0; i < 20; i++) { offset = 1 << i; ftruncate(fd, 0); lseek(fd, offset, SEEK_SET); write(fd, "test", 5); /* Get the alloc size by seek data equal zero*/ if (lseek(fd, 0, SEEK_DATA)) { printf("SEEK_DATA size != 0 (offset = %d)\n", offset); break; } } close(fd); return 0; } Reported-and-Tested-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7eacef08797..9da13847cda4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,6 +571,8 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (f2fs_has_inline_data(inode)) { if (truncate_inline_inode(ipage, from)) set_page_dirty(ipage); + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); truncate_page = true; goto out; From 5521ead70476162d3cef2324320784cb4dbd0c10 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 14:07:53 -0800 Subject: [PATCH 0091/1212] f2fs: relax async discard commands more This patch relaxes async discard commands to avoid waiting its end_io during checkpoint. Instead of waiting them during checkpoint, it will be done when actually reusing them. Test on initial partition of nvme drive. 
# time fstrim /mnt/test Before : 6.158s After : 4.822s Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++----- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/segment.c | 24 +++++++++++++++++++----- fs/f2fs/super.c | 3 +++ 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d485bea3d6bb..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,7 +1255,6 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1274,12 +1273,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) { + if (err) release_discard_addrs(sbi); - } else { + else clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19e054b1c4f8..3409392dde9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,8 @@ struct discard_entry { struct bio_entry { struct list_head list; + block_t lstart; + block_t len; struct bio *bio; struct completion event; int error; @@ -2178,7 +2180,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); +void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c 
index 353ec85b3835..fa3d4f8db389 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -625,20 +625,23 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, - struct bio *bio) + struct bio *bio, block_t lstart, block_t len) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&be->list); be->bio = bio; + be->lstart = lstart; + be->len = len; init_completion(&be->event); list_add_tail(&be->list, wait_list); return be; } -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be, *tmp; @@ -647,7 +650,15 @@ void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) struct bio *bio = be->bio; int err; - wait_for_completion_io(&be->event); + if (!completion_done(&be->event)) { + if ((be->lstart <= blkaddr && + blkaddr < be->lstart + be->len) || + blkaddr == NULL_ADDR) + wait_for_completion_io(&be->event); + else + continue; + } + err = be->error; if (err == -EOPNOTSUPP) err = 0; @@ -756,6 +767,7 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { struct bio *bio = NULL; + block_t lblkstart = blkstart; int err; trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); @@ -770,13 +782,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio); + struct bio_entry *be = __add_bio_entry(sbi, bio, + lblkstart, blklen); bio->bi_private = be; bio->bi_end_io = f2fs_submit_bio_wait_endio; submit_bio(REQ_SYNC, bio); } - return err; } @@ -1655,6 +1667,8 @@ void allocate_data_block(struct 
f2fs_sb_info *sbi, struct page *page, *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); + /* * __add_sum_entry should be resided under the curseg_mutex * because, this function updates a summary entry in the diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3b169927408e..84d5686c4aa4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -770,6 +770,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* be sure to wait for any on-going discard commands */ + f2fs_wait_discard_bio(sbi, NULL_ADDR); + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); From 711f0385dc67a56d49c57e3857833fdf74fd40f8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 16:58:54 -0800 Subject: [PATCH 0092/1212] f2fs: avoid needless checkpoint in f2fs_trim_fs The f2fs_trim_fs() doesn't need to do checkpoint if there are newly allocated data blocks only which didn't change the critical checkpoint data such as nat and sit entries. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..886b96c12c31 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1249,14 +1249,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { - f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); - f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); - f2fs_bug_on(sbi, prefree_segments(sbi)); - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); - unblock_operations(sbi); - goto out; + if (cpc->reason == CP_DISCARD) { + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } } /* From 99a5dca4d9c6efd55dd548cf1e30ce86912f47ac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 22:06:15 -0800 Subject: [PATCH 0093/1212] f2fs: return fs_trim if there is no candidate If there is no candidate for submitting a discard command during f2fs_trim_fs, let's return without checkpoint. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 +++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 886b96c12c31..fbf04d4d7964 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1250,6 +1250,11 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason == CP_DISCARD) { + if (!exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3409392dde9c..3eb53e3a8eae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2186,6 +2186,7 @@ void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); +bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void update_meta_page(struct f2fs_sb_info *, void *, block_t); void write_meta_page(struct f2fs_sb_info *, struct page *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa3d4f8db389..12f8d5ab7ccf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -914,7 +914,8 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, SM_I(sbi)->nr_discards += end - start; } -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; @@ -928,12 +929,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) int i; if 
(se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) - return; + return false; if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) - return; + return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ @@ -951,8 +952,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) && (end - start) < cpc->trim_minlen) continue; + if (check_only) + return true; + __add_discard_entry(sbi, cpc, se, start, end); } + return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) @@ -1533,6 +1538,19 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + + mutex_lock(&SIT_I(sbi)->sentry_lock); + for (; trim_start <= cpc->trim_end; trim_start++) + if (add_discard_addrs(sbi, cpc, true)) + break; + mutex_unlock(&SIT_I(sbi)->sentry_lock); + + return trim_start <= cpc->trim_end; +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); @@ -2329,7 +2347,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* add discard candidates */ if (cpc->reason != CP_DISCARD) { cpc->trim_start = segno; - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); } if (to_journal) { @@ -2367,7 +2385,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } From f948bcc51e136aa4f23f6fe05ea51a5dc66bcdd5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:49:42 +0800 Subject: [PATCH 0094/1212] f2fs: clean up with list_{first, last}_entry Signed-off-by: Chao Yu 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/f2fs/node.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fbf04d4d7964..45ef3b6bfb04 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -892,7 +892,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { @@ -925,7 +925,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, + fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1b19b805ef81..669c267cd36e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1143,7 +1143,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = list_last_entry(pages, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1261,7 +1261,7 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e7997e240366..9278b21ee073 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -174,7 +174,7 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) 
spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); From 72d48dabe998550f45038d44c98ec286e5161ce6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:50:26 +0800 Subject: [PATCH 0095/1212] f2fs: introduce FI_ATOMIC_COMMIT This patch introduces a new flag to indicate inode status of doing atomic write committing, so that, we can keep atomic write status for inode during atomic committing, then we can skip GCing pages of atomic write inode, that avoids random GCed datas being mixed with current transaction, so isolation of transaction can be kept. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/file.c | 11 ++++++----- fs/f2fs/gc.c | 6 ++++++ fs/f2fs/segment.c | 10 +++++++--- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 669c267cd36e..7efc2bf88641 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1976,7 +1976,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (f2fs_is_atomic_file(inode)) { + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { register_inmem_page(inode, page); return 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3eb53e3a8eae..807855d37c63 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1706,6 +1706,7 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -1895,6 +1896,11 @@ static inline bool 
f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); +} + static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9da13847cda4..e4e5d76d80b0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1573,14 +1573,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); - if (ret) { - set_inode_flag(inode, FI_ATOMIC_FILE); + if (ret) goto err_out; - } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - stat_dec_atomic_write(inode); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); + } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d3a36e4b442c..7f0c3e02408c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -569,6 +569,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -661,6 +664,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 12f8d5ab7ccf..6a870677d58a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -242,12 +242,12 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); - 
stat_dec_atomic_write(inode); - mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); } static int __commit_inmem_pages(struct inode *inode, @@ -316,6 +316,8 @@ int commit_inmem_pages(struct inode *inode) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + mutex_lock(&fi->inmem_lock); err = __commit_inmem_pages(inode, &revoke_list); if (err) { @@ -337,6 +339,8 @@ int commit_inmem_pages(struct inode *inode) } mutex_unlock(&fi->inmem_lock); + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + f2fs_unlock_op(sbi); return err; } From c50d5c09193e413467fca1a3fdfa5a69e59a6930 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:51:01 +0800 Subject: [PATCH 0096/1212] f2fs: check in-memory block bitmap This patch adds a mirror for valid block bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++++++++++++++++++++++-- fs/f2fs/segment.h | 6 ++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6a870677d58a..aae1c2ea7a1d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1101,14 +1101,32 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -2432,6 +2450,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2861,6 +2886,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif 
kfree(sit_i->sentries[start].ckpt_valid_map); kfree(sit_i->sentries[start].discard_map); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 08f1455c812c..9af95194db06 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -164,6 +164,9 @@ struct seg_entry { unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. @@ -320,6 +323,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } From 8a576d4d407b72324e476544b3ab9ae2b0998788 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:01 +0800 Subject: [PATCH 0097/1212] f2fs: check in-memory nat version bitmap This patch adds a mirror for nat version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 11 +++++++++++ fs/f2fs/node.h | 15 +++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 807855d37c63..d4783d9cf4e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -607,6 +607,9 @@ struct f2fs_nm_info { /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b01b01cfc39e..bc67dc323f7e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2366,6 +2366,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + return 0; } @@ -2440,6 +2448,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 9278b21ee073..29ff783eb9c3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -186,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -203,6 +209,12 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) (seg_off << sbi->log_blocks_per_seg << 1) + (block_off & (sbi->blocks_per_seg - 1))); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != + 
f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -228,6 +240,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } static inline nid_t ino_of_node(struct page *node_page) From e3d4c4b5f18cd817225906e341aadf8fd8a01345 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:34 +0800 Subject: [PATCH 0098/1212] f2fs: check in-memory sit version bitmap This patch adds a mirror for sit version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++---- fs/f2fs/segment.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aae1c2ea7a1d..c39bbffb0cac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2421,7 +2421,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; + char *src_bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -2483,17 +2483,22 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return 
-ENOMEM; +#endif + /* init SIT information */ sit_i->s_ops = &default_salloc_ops; sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; - sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; @@ -2901,6 +2906,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif kfree(sit_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9af95194db06..5cb5755c75d9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -209,6 +209,9 @@ struct sit_info { block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ @@ -423,6 +426,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } @@ -643,6 +652,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, check_seg_range(sbi, start); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; @@ -668,6 +683,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) unsigned int block_off = SIT_BLOCK_OFFSET(start); 
f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) From 0d7a55b0135b91c5eb8ecaf1b6bfe0c0f5eca3fd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 14:13:03 -0800 Subject: [PATCH 0099/1212] f2fs: clean up flush/discard command namings This patch simply cleans up the names for flush/discard commands. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 20 +++++----- fs/f2fs/segment.c | 98 +++++++++++++++++++++++------------------------ 3 files changed, 59 insertions(+), 61 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 29cdf0c1da1d..883f1ea9e0b6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -194,7 +194,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += sizeof(struct f2fs_gc_kthread); /* build merge flush thread */ - if (SM_I(sbi)->cmd_control_info) + if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4783d9cf4e0..167c5f841b5f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -248,13 +248,12 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; -struct bio_entry { - struct list_head list; - block_t lstart; - block_t len; - struct bio *bio; - struct completion event; - int error; +struct discard_cmd { + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + block_t lstart; /* logical start address */ + block_t len; /* length */ + struct bio *bio; /* bio */ }; /* for the list of fsync inodes, used only during recovery */ @@ -701,8 +700,8 @@ struct f2fs_sm_info { unsigned int rec_prefree_segments; /* for small discard management */ - struct list_head discard_list; /* 4KB discard list */ - struct list_head wait_list; /* linked with issued discard bio 
*/ + struct list_head discard_entry_list; /* 4KB discard entry list */ + struct list_head discard_cmd_list; /* discard cmd list */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -716,8 +715,7 @@ struct f2fs_sm_info { unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ - struct flush_cmd_control *cmd_control_info; - + struct flush_cmd_control *fcc_info; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c39bbffb0cac..289b3facd2d8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,7 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -439,7 +439,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) @@ -468,7 +468,7 @@ static int issue_flush_thread(void *data) int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), @@ -511,8 +511,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; - if (SM_I(sbi)->cmd_control_info) { - fcc = SM_I(sbi)->cmd_control_info; + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; goto init_thread; } @@ -522,14 +522,14 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); 
init_llist_head(&fcc->issue_list); - SM_I(sbi)->cmd_control_info = fcc; + SM_I(sbi)->fcc_info = fcc; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; return err; } @@ -538,7 +538,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; @@ -548,7 +548,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } if (free) { kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; } } @@ -628,42 +628,43 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, +static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc; - INIT_LIST_HEAD(&be->list); - be->bio = bio; - be->lstart = lstart; - be->len = len; - init_completion(&be->event); - list_add_tail(&be->list, wait_list); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bio = bio; + dc->lstart = lstart; + dc->len = len; + init_completion(&dc->wait); + list_add_tail(&dc->list, wait_list); - return be; + return dc; } /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, 
block_t blkaddr) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be, *tmp; + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc, *tmp; - list_for_each_entry_safe(be, tmp, wait_list, list) { - struct bio *bio = be->bio; + list_for_each_entry_safe(dc, tmp, wait_list, list) { + struct bio *bio = dc->bio; int err; - if (!completion_done(&be->event)) { - if ((be->lstart <= blkaddr && - blkaddr < be->lstart + be->len) || + if (!completion_done(&dc->wait)) { + if ((dc->lstart <= blkaddr && + blkaddr < dc->lstart + dc->len) || blkaddr == NULL_ADDR) - wait_for_completion_io(&be->event); + wait_for_completion_io(&dc->wait); else continue; } - err = be->error; + err = bio->bi_error; if (err == -EOPNOTSUPP) err = 0; @@ -672,17 +673,16 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) "Issue discard failed, ret: %d", err); bio_put(bio); - list_del(&be->list); - kmem_cache_free(bio_entry_slab, be); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); } } -static void f2fs_submit_bio_wait_endio(struct bio *bio) +static void f2fs_submit_discard_endio(struct bio *bio) { - struct bio_entry *be = (struct bio_entry *)bio->bi_private; + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - be->error = bio->bi_error; - complete(&be->event); + complete(&dc->wait); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -786,11 +786,11 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio, + struct discard_cmd *dc = __add_discard_cmd(sbi, bio, lblkstart, blklen); - bio->bi_private = be; - bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); } return err; @@ -897,7 +897,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct 
seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_list; + struct list_head *head = &SM_I(sbi)->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { @@ -966,7 +966,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,7 +992,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct blk_plug plug; @@ -2783,8 +2783,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_list); - INIT_LIST_HEAD(&sm_info->wait_list); + INIT_LIST_HEAD(&sm_info->discard_entry_list); + INIT_LIST_HEAD(&sm_info->discard_cmd_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2934,15 +2934,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; - bio_entry_slab = f2fs_kmem_cache_create("bio_entry", - sizeof(struct bio_entry)); - if (!bio_entry_slab) + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_bio_entry; + goto destroy_discard_cmd; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2952,8 +2952,8 @@ int __init 
create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destroy_bio_entry: - kmem_cache_destroy(bio_entry_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2963,7 +2963,7 @@ int __init create_segment_manager_caches(void) void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); - kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From 4844bb76e75265a7c742cfcaf0e8e54f02994933 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 10:21:15 -0800 Subject: [PATCH 0100/1212] f2fs: reorganize stat information This patch modifies stat information more clearly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 883f1ea9e0b6..cd338ca24941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -258,8 +258,6 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); - seq_printf(s, " - Atomic write count: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -318,8 +316,10 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", - si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. 
%4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", From c4cc29d19eaf010c1133823438f5a3adba155f05 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 18:16:29 -0800 Subject: [PATCH 0101/1212] f2fs: remove batched discard in f2fs_trim_fs We don't need to do multiple checkpoints, since we don't actually wait for completion of discard commands during checkpoint. Instead, we still need to avoid very big discard commands, since such a large discard can interfere with block allocation. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ---- fs/f2fs/f2fs.h | 9 +----- fs/f2fs/segment.c | 38 +++++++++---------------- fs/f2fs/super.c | 2 -- 4 files changed, 14 insertions(+), 41 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c727..bc8fbfa1c800 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -75,12 +75,6 @@ Contact: "Jaegeuk Kim" Description: Controls the memory footprint used by f2fs. -What: /sys/fs/f2fs//trim_sections -Date: February 2015 -Contact: "Jaegeuk Kim" -Description: - Controls the trimming rate in batch mode. 
- What: /sys/fs/f2fs//cp_interval Date: October 2015 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 167c5f841b5f..dc436f780295 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -195,11 +195,7 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 2 -#define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) -#define BATCHED_TRIM_BLOCKS(sbi) \ - (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) (1 << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -705,9 +701,6 @@ struct f2fs_sm_info { int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 289b3facd2d8..245ba28529b1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -903,7 +903,8 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, if (!list_empty(head)) { last = list_last_entry(head, struct discard_entry, list); if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { + last->blkaddr + last->len && + last->len <= MAX_DISCARD_BLOCKS(sbi)) { last->len += end - start; goto done; } @@ -1593,36 +1594,25 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) "Found FS corruption, run fsck to fix."); goto out; } + if (sbi->discard_blks == 0) + goto out; /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + /* + * do checkpoint to issue discard commands safely since we now can + * use async discard. 
+ */ cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; - /* do checkpoint to issue discard commands safely */ - for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { - cpc.trim_start = start_segno; - - if (sbi->discard_blks == 0) - break; - else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) - cpc.trim_end = end_segno; - else - cpc.trim_end = min_t(unsigned int, - rounddown(start_segno + - BATCHED_TRIM_SEGMENTS(sbi), - sbi->segs_per_sec) - 1, end_segno); - - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); - if (err) - break; - - schedule(); - } + mutex_lock(&sbi->gc_mutex); + err = write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -2788,8 +2778,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; - sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; - INIT_LIST_HEAD(&sm_info->sit_entry_set); if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 84d5686c4aa4..38d40670aed0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -284,7 +284,6 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -309,7 +308,6 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), 
ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 565f0225f95f1518132952e8fe6854c92a60fd46 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 14:40:24 -0800 Subject: [PATCH 0102/1212] f2fs: factor out discard command info into discard_cmd_control This patch adds discard_cmd_control with the existing discarding controls. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 ++ fs/f2fs/f2fs.h | 16 ++++++----- fs/f2fs/segment.c | 68 ++++++++++++++++++++++++++++++++++++----------- fs/f2fs/super.c | 5 +++- 4 files changed, 69 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index cd338ca24941..f9f6b0aeba02 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -196,6 +196,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) + si->cache_mem += sizeof(struct discard_cmd_control); /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc436f780295..1bec4707e830 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -252,6 +252,13 @@ struct discard_cmd { struct bio *bio; /* bio */ }; +struct discard_cmd_control { + struct list_head discard_entry_list; /* 4KB discard entry list */ + int nr_discards; /* # of discards in the list */ + struct list_head discard_cmd_list; /* discard cmd list */ + int max_discards; /* max. 
discards to be issued */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -695,12 +702,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - struct list_head discard_cmd_list; /* discard cmd list */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ @@ -709,6 +710,9 @@ struct f2fs_sm_info { /* for flush command control */ struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 245ba28529b1..dbe4b3e3198f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -631,7 +631,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *cmd_list = &(dcc->discard_cmd_list); struct discard_cmd *dc; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); @@ -640,7 +641,7 @@ static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart = lstart; dc->len = len; init_completion(&dc->wait); - list_add_tail(&dc->list, wait_list); + list_add_tail(&dc->list, cmd_list); return dc; } @@ -648,7 +649,8 @@ static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct list_head *wait_list = 
&(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->discard_cmd_list); struct discard_cmd *dc, *tmp; list_for_each_entry_safe(dc, tmp, wait_list, list) { @@ -897,7 +899,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_entry_list; + struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { @@ -916,7 +918,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, new->len = end - start; list_add_tail(&new->list, head); done: - SM_I(sbi)->nr_discards += end - start; + SM_I(sbi)->dcc_info->nr_discards += end - start; } static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, @@ -938,7 +940,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) return false; } @@ -947,7 +950,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, dmap[i] = force ? 
~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -967,7 +971,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -993,7 +997,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct blk_plug plug; @@ -1053,13 +1057,47 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) cpc->trimmed += entry->len; skip: list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } blk_finish_plug(&plug); } +int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc; + int err = 0; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + INIT_LIST_HEAD(&dcc->discard_entry_list); + INIT_LIST_HEAD(&dcc->discard_cmd_list); + dcc->nr_discards = 0; + dcc->max_discards = 0; + + SM_I(sbi)->dcc_info = dcc; +init_thread: + return err; +} + +void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +{ + struct discard_cmd_control *dcc 
= SM_I(sbi)->dcc_info; + + if (free) { + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + } +} + static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); @@ -2773,11 +2811,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_entry_list); - INIT_LIST_HEAD(&sm_info->discard_cmd_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; - INIT_LIST_HEAD(&sm_info->sit_entry_set); if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { @@ -2786,6 +2819,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2907,6 +2944,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 38d40670aed0..1f152734b2ec 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -145,6 +145,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -168,6 +169,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; else if (struct_type == NM_INFO) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) @@ -283,7 +286,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); 
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); From 587ad91ac9a8fe33865b05e086fea6384ecfbe48 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 20:32:07 -0800 Subject: [PATCH 0103/1212] f2fs: add a kernel thread to issue discard commands asynchronously This patch adds a kernel thread to issue discard commands. It proposes three states, D_PREP, D_SUBMIT, and D_DONE to identify current bio status. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 11 ++++ fs/f2fs/segment.c | 131 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 109 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1bec4707e830..29aa96496c67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -196,6 +196,7 @@ enum { }; #define MAX_DISCARD_BLOCKS(sbi) (1 << (sbi)->log_blocks_per_seg) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -244,18 +245,28 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + struct discard_cmd { struct list_head list; /* command list */ struct completion wait; /* compleation */ block_t lstart; /* logical start address */ block_t len; /* length */ struct bio *bio; /* bio */ + int state; /* state */ }; struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head discard_entry_list; /* 4KB discard 
entry list */ int nr_discards; /* # of discards in the list */ struct list_head discard_cmd_list; /* discard cmd list */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + struct mutex cmd_lock; int max_discards; /* max. discards to be issued */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dbe4b3e3198f..bae15887ac98 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -628,7 +628,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, +static void __add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -638,12 +638,30 @@ static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); dc->bio = bio; + bio->bi_private = dc; dc->lstart = lstart; dc->len = len; + dc->state = D_PREP; init_completion(&dc->wait); - list_add_tail(&dc->list, cmd_list); - return dc; + mutex_lock(&dcc->cmd_lock); + list_add_tail(&dc->list, cmd_list); + mutex_unlock(&dcc->cmd_lock); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) +{ + int err = dc->bio->bi_error; + + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + bio_put(dc->bio); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -653,31 +671,28 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) struct list_head *wait_list = &(dcc->discard_cmd_list); struct discard_cmd *dc, *tmp; + mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - struct bio *bio = dc->bio; - int err; - if (!completion_done(&dc->wait)) { - if ((dc->lstart <= blkaddr 
&& - blkaddr < dc->lstart + dc->len) || - blkaddr == NULL_ADDR) - wait_for_completion_io(&dc->wait); - else - continue; + if (blkaddr == NULL_ADDR) { + if (dc->state == D_PREP) { + dc->state = D_SUBMIT; + submit_bio(REQ_SYNC, dc->bio); + } + wait_for_completion_io(&dc->wait); + + __remove_discard_cmd(sbi, dc); + continue; } - err = bio->bi_error; - if (err == -EOPNOTSUPP) - err = 0; - - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); - - bio_put(bio); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); + if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { + if (dc->state == D_SUBMIT) + wait_for_completion_io(&dc->wait); + else + __remove_discard_cmd(sbi, dc); + } } + mutex_unlock(&dcc->cmd_lock); } static void f2fs_submit_discard_endio(struct bio *bio) @@ -685,6 +700,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; complete(&dc->wait); + dc->state = D_DONE; } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -768,6 +784,45 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + struct list_head *cmd_list = &dcc->discard_cmd_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0; +repeat: + if (kthread_should_stop()) + return 0; + + blk_start_plug(&plug); + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, cmd_list, list) { + if (dc->state == D_PREP) { + dc->state = D_SUBMIT; + submit_bio(REQ_SYNC, dc->bio); + if (iter++ > DISCARD_ISSUE_RATE) + break; + } else if (dc->state == D_DONE) { + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); + + blk_finish_plug(&plug); + + iter = 0; + congestion_wait(BLK_RW_SYNC, HZ/50); + + wait_event_interruptible(*q, + 
kthread_should_stop() || !list_empty(&dcc->discard_cmd_list)); + goto repeat; +} + + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -788,12 +843,9 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct discard_cmd *dc = __add_discard_cmd(sbi, bio, - lblkstart, blklen); - - bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); + __add_discard_cmd(sbi, bio, lblkstart, blklen); + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); } return err; } @@ -1000,14 +1052,11 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); - blk_start_plug(&plug); - mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -1060,12 +1109,11 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) SM_I(sbi)->dcc_info->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } - - blk_finish_plug(&plug); } int create_discard_cmd_control(struct f2fs_sb_info *sbi) { + dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; int err = 0; @@ -1080,11 +1128,22 @@ int create_discard_cmd_control(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->discard_entry_list); INIT_LIST_HEAD(&dcc->discard_cmd_list); + mutex_init(&dcc->cmd_lock); dcc->nr_discards = 0; dcc->max_discards = 0; + init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: + dcc->f2fs_issue_discard = 
kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + return err; } @@ -1092,6 +1151,12 @@ void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } if (free) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; From 334173cc4ca16534d011446c301e85a7cba5c035 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 10:20:04 -0800 Subject: [PATCH 0104/1212] f2fs: show # of on-going flush and discard bios This patch adds stat information for flush and discard commands. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/debug.c | 11 +++++++++-- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 6 ++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f9f6b0aeba02..0ca977a94c13 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -54,6 +54,12 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) + si->nr_flush = + atomic_read(&SM_I(sbi)->fcc_info->submit_flush); + if (SM_I(sbi) && SM_I(sbi)->dcc_info) + si->nr_discard = + atomic_read(&SM_I(sbi)->dcc_info->submit_discard); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -318,8 +324,9 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, 
si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", - si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flush, si->nr_discard); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 29aa96496c67..9a4e2012ba36 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -268,6 +268,7 @@ struct discard_cmd_control { wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; int max_discards; /* max. discards to be issued */ + atomic_t submit_discard; /* # of issued discard */ }; /* for the list of fsync inodes, used only during recovery */ @@ -2321,7 +2322,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data; + int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; int inline_xattr, inline_inode, inline_dir, orphans; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bae15887ac98..5efc36f88b4a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -653,6 +653,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *d { int err = dc->bio->bi_error; + if (dc->state == D_DONE) + atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); + if (err == -EOPNOTSUPP) err = 0; @@ -678,6 +681,7 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) if (dc->state == D_PREP) { dc->state = D_SUBMIT; submit_bio(REQ_SYNC, dc->bio); + atomic_inc(&dcc->submit_discard); } wait_for_completion_io(&dc->wait); @@ -804,6 +808,7 @@ static int 
issue_discard_thread(void *data) if (dc->state == D_PREP) { dc->state = D_SUBMIT; submit_bio(REQ_SYNC, dc->bio); + atomic_inc(&dcc->submit_discard); if (iter++ > DISCARD_ISSUE_RATE) break; } else if (dc->state == D_DONE) { @@ -1129,6 +1134,7 @@ int create_discard_cmd_control(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->discard_entry_list); INIT_LIST_HEAD(&dcc->discard_cmd_list); mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->submit_discard, 0); dcc->nr_discards = 0; dcc->max_discards = 0; From 87d83ae92ee06c4478d17db9934c0ea3b52be164 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jan 2017 13:12:29 -0800 Subject: [PATCH 0105/1212] f2fs: do not preallocate blocks which has wrong buffer Sheng Yong reports needless preallocation if write(small_buffer, large_size) is called. In that case, f2fs preallocates large_size, but vfs returns early due to small_buffer size. Let's detect it before preallocation phase in f2fs. Reported-by: Sheng Yong Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/file.c --- fs/f2fs/data.c | 6 +++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 8 +++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7efc2bf88641..ead210ae9468 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -749,6 +749,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -1650,7 +1653,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. 
*/ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a4e2012ba36..7fc161474647 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1724,6 +1724,7 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e4e5d76d80b0..27ef66ff7aab 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -2277,8 +2278,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - int err = f2fs_preallocate_blocks(iocb, from); + int err; + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + err = f2fs_preallocate_blocks(iocb, from); if (err) { inode_unlock(inode); return err; @@ -2286,6 +2291,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); } inode_unlock(inode); From b89d1d4dfd7d78e91b425706c23130bbdcebb813 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Sun, 22 Jan 2017 12:21:02 +0800 Subject: [PATCH 0106/1212] f2fs: fix a dead loop in f2fs_fiemap() A dead loop can be triggered in f2fs_fiemap() using the test case as below: ... fd = open(); fallocate(fd, 0, 0, 4294967296); ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); ... It's caused by an overflow in __get_data_block(): ... 
bh->b_size = map.m_len << inode->i_blkbits; ... map.m_len is an unsigned int, and bh->b_size is a size_t which is 64 bits on a 64-bit architecture, type conversion from an unsigned int to a size_t will result in an overflow. In the above-mentioned case, bh->b_size will be zero, and f2fs_fiemap() will call get_data_block() at block 0 again and again. Fix this by adding a force conversion before left shift. Signed-off-by: Wei Fang Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ead210ae9468..dbd2822b5ab8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -964,7 +964,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } return err; } From 21980a25e7276552f4923089dc28bb9af9024e9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jan 2017 20:39:51 +0800 Subject: [PATCH 0107/1212] f2fs: enhance lookup xattr Previously, in getxattr we will load all entries both in inline xattr and xattr node block, and then do the lookup in all entries, but our lookup flow shows low efficiency, since if we can lookup and hit in inline xattr of inode page cache first, we don't need to load and lookup xattr node block, which can obviously save cpu time and IO latency.
Signed-off-by: Chao Yu [Jaegeuk Kim: initialize NULL to avoid warning] Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 132 ++++++++++++++++++++++++++++++++++++++++++------ fs/f2fs/xattr.h | 7 +-- 2 files changed, 121 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1c4d5e39586c..ba67ca0c7014 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,6 +264,112 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? 
VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = 0; + int err = 0; + + inline_size = inline_xattr_size(inode); + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { @@ -395,8 +501,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -408,10 +513,11 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, int 
f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + char *pval; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -420,30 +526,26 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } + pval = entry->e_name + entry->e_name_len; + if (buffer) { char *pval = entry->e_name + entry->e_name_len; memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d2fd0387a3c7..ba64f43d163d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ From d27bebf86574c0edbbf7d17dfd6568ba72c752c7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Jan 2017 10:52:39 +0800 Subject: [PATCH 0108/1212] f2fs: fix to avoid overflow when left 
shifting page offset We use the following method to calculate size with current page index: size = index << PAGE_SHIFT If the type of index is only 32 bits wide, left shifting will incur overflow, which makes the result incorrect. So let's cast index to a 64-bit type to avoid such an issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4fb4471a3206..e93316ea8d1b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -428,8 +428,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } if (!file_keep_isize(inode) && - (i_size_read(inode) <= (start << PAGE_SHIFT))) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block From fb40e1231cbc48f58432e095a22aadcff0c0c557 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Jan 2017 10:52:40 +0800 Subject: [PATCH 0109/1212] f2fs: fix null pointer dereference when issuing flush in ->fsync We only allocate flush merge control structure sbi::sm_info::fcc_info when flush_merge option is on, but in f2fs_issue_flush we still try to access member of the control structure without that option, it incurs panic as shown below, fix it.
Call Trace: __remove_ino_entry+0xa9/0xc0 [f2fs] f2fs_do_sync_file.isra.27+0x214/0x6d0 [f2fs] f2fs_sync_file+0x18/0x20 [f2fs] vfs_fsync_range+0x3d/0xb0 __do_page_fault+0x261/0x4d0 do_fsync+0x3d/0x70 SyS_fsync+0x10/0x20 do_syscall_64+0x6e/0x180 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7f18ce260de0 RSP: 002b:00007ffdd4589258 EFLAGS: 00000246 ORIG_RAX: 000000000000004a RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f18ce260de0 RDX: 0000000000000006 RSI: 00000000016c0360 RDI: 0000000000000003 RBP: 00000000016c0360 R08: 000000000000ffff R09: 000000000000001f R10: 00007ffdd4589020 R11: 0000000000000246 R12: 00000000016c0100 R13: 0000000000000000 R14: 00000000016c1f00 R15: 00000000016c0100 Code: fb 81 e3 00 08 00 00 48 89 45 a0 0f 1f 44 00 00 31 c0 85 db 75 27 41 81 e7 00 04 00 00 74 0c 41 8b 45 20 85 c0 0f 85 81 00 00 00 41 ff 45 20 4c 89 e7 e8 f8 e9 ff ff f0 41 ff 4d 20 48 83 c4 RIP: f2fs_issue_flush+0x5b/0x170 [f2fs] RSP: ffffc90003b5fd78 CR2: 0000000000000020 ---[ end trace a09314c24f037648 ]--- Reported-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5efc36f88b4a..11f2eccd873d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -477,7 +477,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { + if (!test_opt(sbi, FLUSH_MERGE)) + return submit_flush_wait(sbi); + + if (!atomic_read(&fcc->submit_flush)) { int ret; atomic_inc(&fcc->submit_flush); From f3ca0da5c793d6a74ad6355c70b40f0fb314a779 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 27 Jan 2017 09:35:37 +0800 Subject: [PATCH 0110/1212] f2fs: show the fault injection mount option This patch shows the fault injection mount option in f2fs_show_options(). 
Signed-off-by: Kaixu Xia Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f152734b2ec..08b6ba9b3f14 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -554,6 +554,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -944,6 +945,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_puts(seq, ",fault_injection"); +#endif return 0; } From 01940a21a97e8f03b8f8fe618728ff099f0e9481 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 29 Jan 2017 14:27:02 +0900 Subject: [PATCH 0111/1212] f2fs: declare missing static function We missed two functions declared as static functions. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 11f2eccd873d..9f0d77b4eefd 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1119,7 +1119,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) } } -int create_discard_cmd_control(struct f2fs_sb_info *sbi) +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; @@ -1156,7 +1156,7 @@ int create_discard_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; From c99f2de7780a483dd50456282442d1021ad9d246 Mon Sep 17 00:00:00 2001 From: DongOh Shin Date: Mon, 30 Jan 2017 10:55:17 -0800 Subject: [PATCH 0112/1212] f2fs: fix 3 coding style errors in f2fs.h Two coding style errors below have been resolved: "Macros with complex values should be enclosed in parentheses" And a coding style error below has been resolved: "space prohibited before that ',' (ctx:WxW)" Signed-off-by: DongOh Shin Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7fc161474647..95a559a359ca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -109,9 +109,9 @@ struct f2fs_mount_info { #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* bio stuffs */ #define REQ_OP_READ READ @@ 
-2107,7 +2107,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); + const struct qstr *, f2fs_hash_t, unsigned int); int f2fs_add_regular_entry(struct inode *, const struct qstr *, const struct qstr *, struct inode *, nid_t, umode_t); int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, From a94d94904cd3ecce3dc44f582c5213747467161b Mon Sep 17 00:00:00 2001 From: DongOh Shin Date: Mon, 30 Jan 2017 10:55:18 -0800 Subject: [PATCH 0113/1212] f2fs: fix 446 coding style warnings in f2fs.h 1) Nine coding style warnings below have been resolved: "Missing a blank line after declarations" 2) 435 coding style warnings below have been resolved: "function definition argument 'x' should also have an identifier name" 3) Two coding style warnings below have been resolved: "macros should not use a trailing semicolon" Signed-off-by: DongOh Shin Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 498 +++++++++++++++++++++++++++---------------------- 1 file changed, 270 insertions(+), 228 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 95a559a359ca..7f97aee70b12 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -293,6 +293,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -300,6 +301,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -385,12 +387,14 @@ static inline void make_dentry_ptr(struct inode *inode, if (type == 1) { struct 
f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; + d->max = NR_DENTRY_IN_BLOCK; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } else { struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; + d->max = NR_INLINE_DENTRY; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; @@ -579,7 +583,7 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { @@ -1537,6 +1541,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; @@ -1620,6 +1625,7 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } @@ -1633,6 +1639,7 @@ static inline block_t datablock_addr(struct page *node_page, { struct f2fs_node *raw_node; __le32 *addr_array; + raw_node = F2FS_NODE(node_page); addr_array = blkaddr_in_node(raw_node); return le32_to_cpu(addr_array[offset]); @@ -1869,6 +1876,7 @@ static inline unsigned int addrs_per_inode(struct inode *inode) static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1930,6 +1938,7 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) static inline void *inline_data_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[1]); } @@ -2052,29 +2061,30 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) /* * file.c */ -int f2fs_sync_file(struct file 
*, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, 
struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -2084,40 +2094,47 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t, unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct 
inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode *); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +int update_dent_inode(struct inode *inode, struct inode *to, + const struct qstr *name); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); 
+int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2128,18 +2145,18 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *, bool); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); /* * node.c @@ -2147,164 +2164,183 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *); struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct 
inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *, bool); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int 
wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page *new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, + unsigned int ofs, struct page *ipage); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +void recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); -void 
invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +int commit_inmem_pages(struct inode 
*inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +void rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool 
ordered); +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, + block_t blkaddr); +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct 
f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -int 
f2fs_submit_page_mbio(struct f2fs_io_info *); -struct block_device *f2fs_target_device(struct f2fs_sb_info *, - block_t, struct bio *); -int f2fs_target_device_index(struct f2fs_sb_info *, block_t); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw); +void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_mbio(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); 
+int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int 
recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2398,9 +2434,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) #define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ @@ -2440,8 +2476,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else @@ -2493,49 +2529,55 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void 
f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +bool truncate_inline_inode(struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); 
+unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); From de0e3bc1a54a884d5c7392a0441b272c0e60ed5b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 1 Feb 2017 15:40:11 -0800 Subject: [PATCH 0114/1212] f2fs: show # of APPEND and UPDATE inodes This patch shows cached # of APPEND and UPDATE inode entries. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 6 ++++-- fs/f2fs/f2fs.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0ca977a94c13..de8da9fc5c99 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -70,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -264,8 +266,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7f97aee70b12..a32d1a2523a5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2360,7 +2360,7 @@ struct f2fs_stat_info { int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; - int inline_xattr, inline_inode, inline_dir, orphans; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; From 040eb7fd6c297f6f2b7e899930892e87d6a52796 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 Feb 2017 16:40:55 -0800 Subject: [PATCH 0115/1212] f2fs: move flush tracepoint This patch moves the tracepoint location for flush command. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9f0d77b4eefd..ab62f0be2b15 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -426,6 +426,9 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) int ret = __submit_flush_wait(sbi->sb->s_bdev); int i; + trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); + if (sbi->s_ndevs && !ret) { for (i = 1; i < sbi->s_ndevs; i++) { ret = __submit_flush_wait(FDEV(i).bdev); @@ -471,9 +474,6 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - if (test_opt(sbi, NOBARRIER)) return 0; From 8f70c40113f4c69e1bb9e1906e8d3b7a4c2b16d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 Feb 2017 18:27:17 -0800 Subject: [PATCH 0116/1212] f2fs: move write_node_page above fsync_node_pages This patch just moves write_node_page and introduces an inner function. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 140 ++++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 67 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc67dc323f7e..5cdcf5ba43a0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1318,6 +1318,78 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) return last_page; } +static int __write_node_page(struct page *page, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) + f2fs_submit_merged_bio(sbi, NODE, WRITE); + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct 
writeback_control *wbc) +{ + return __write_node_page(page, wbc); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { @@ -1397,7 +1469,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, wbc); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1580,72 +1652,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - 
f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { From ed0eee678877e7e9276262616d447813edbb33ab Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 Feb 2017 18:18:06 -0800 Subject: [PATCH 0117/1212] f2fs: avoid out-of-order execution of atomic writes We need to flush data writes before flushing last node block writes by using FUA with PREFLUSH. We don't need to guarantee precedent node writes since if those are not written, we can't reach to the last node block when scanning node block chain during roll-forward recovery. Afterwards f2fs_wait_on_page_writeback guarantees all the IO submission to disk, which builds a valid node block chain. Signed-off-by: Jaegeuk Kim Conflicts: include/trace/events/f2fs.h --- fs/f2fs/file.c | 3 ++- fs/f2fs/node.c | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 27ef66ff7aab..12c12cb4a06f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -279,7 +279,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(sbi); + if (!atomic) + ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5cdcf5ba43a0..d24bdb970a24 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1318,7 +1318,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) return last_page; } -static int __write_node_page(struct page *page, +static int __write_node_page(struct page *page, bool atomic, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -1362,6 +1362,9 @@ static int __write_node_page(struct 
page *page, return 0; } + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= WRITE_FLUSH_FUA; + set_page_writeback(page); fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); @@ -1387,7 +1390,7 @@ static int __write_node_page(struct page *page, static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, wbc); + return __write_node_page(page, false, wbc); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1469,7 +1472,8 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_node_page(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, wbc); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); From 33f44e9f9c086f3e4767e7ebb17256e3b6da79db Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 3 Feb 2017 17:18:00 -0800 Subject: [PATCH 0118/1212] f2fs: call internal __write_data_page directly This patch introduces __write_data_page so that f2fs_write_cache_pages can call it directly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dbd2822b5ab8..f8a0bab49f0d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1343,7 +1343,7 @@ int do_write_data_page(struct f2fs_io_info *fio) return err; } -static int f2fs_write_data_page(struct page *page, +static int __write_data_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -1445,6 +1445,12 @@ static int f2fs_write_data_page(struct page *page, return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, wbc); +} + /* * This function was copied from write_cche_pages from mm/page-writeback.c.
* The major change is making write step of cold data page separately from @@ -1534,7 +1540,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, wbc); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to From 6cd8a154387ddc8025c93327e86cfbe690dd7035 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 09:52:35 -0800 Subject: [PATCH 0119/1212] f2fs: fix missing bio_alloc(1) For discard commands, we should use bio_alloc(1) in old versions. Fixes: 373bb0247a ("f2fs: support async discard based on v4.9") Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ab62f0be2b15..83d34c80ec37 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -769,7 +769,7 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (ret) return ret; } - bio = f2fs_bio_alloc(0); + bio = f2fs_bio_alloc(1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); From 5e95180bf64c01a1ed4238ff92b1524ebfb8e34f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 16:53:14 -0800 Subject: [PATCH 0120/1212] Revert "f2fs: remove batched discard in f2fs_trim_fs" This reverts commit c4cc29d19eaf010c1133823438f5a3adba155f05. 
Conflicts: fs/f2fs/f2fs.h fs/f2fs/segment.c fs/f2fs/super.c --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++ fs/f2fs/f2fs.h | 10 ++++++- fs/f2fs/segment.c | 38 ++++++++++++++++--------- fs/f2fs/super.c | 2 ++ 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index bc8fbfa1c800..0345f2d1c727 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -75,6 +75,12 @@ Contact: "Jaegeuk Kim" Description: Controls the memory footprint used by f2fs. +What: /sys/fs/f2fs//trim_sections +Date: February 2015 +Contact: "Jaegeuk Kim" +Description: + Controls the trimming rate in batch mode. + What: /sys/fs/f2fs//cp_interval Date: October 2015 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a32d1a2523a5..0c22dfd69d6e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -195,7 +195,12 @@ enum { CP_DISCARD, }; -#define MAX_DISCARD_BLOCKS(sbi) (1 << (sbi)->log_blocks_per_seg) +#define DEF_BATCHED_TRIM_SECTIONS 2 +#define BATCHED_TRIM_SEGMENTS(sbi) \ + (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) +#define BATCHED_TRIM_BLOCKS(sbi) \ + (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) + #define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -718,6 +723,9 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; + /* for batched trimming */ + unsigned int trim_sections; /* # of sections to trim */ + struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 83d34c80ec37..ab0fc88bfe17 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -965,8 +965,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, if (!list_empty(head)) { last = list_last_entry(head, struct discard_entry, list); 
if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len && - last->len <= MAX_DISCARD_BLOCKS(sbi)) { + last->blkaddr + last->len) { last->len += end - start; goto done; } @@ -1706,25 +1705,36 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) "Found FS corruption, run fsck to fix."); goto out; } - if (sbi->discard_blks == 0) - goto out; /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); - /* - * do checkpoint to issue discard commands safely since we now can - * use async discard. - */ cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); - cpc.trim_start = start_segno; - cpc.trim_end = end_segno; - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + /* do checkpoint to issue discard commands safely */ + for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { + cpc.trim_start = start_segno; + + if (sbi->discard_blks == 0) + break; + else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) + cpc.trim_end = end_segno; + else + cpc.trim_end = min_t(unsigned int, + rounddown(start_segno + + BATCHED_TRIM_SEGMENTS(sbi), + sbi->segs_per_sec) - 1, end_segno); + + mutex_lock(&sbi->gc_mutex); + err = write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); + } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -2885,6 +2895,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; + INIT_LIST_HEAD(&sm_info->sit_entry_set); if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 08b6ba9b3f14..171ca84c7769 100644 --- a/fs/f2fs/super.c +++ 
b/fs/f2fs/super.c @@ -287,6 +287,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -311,6 +312,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 0166e6469f9dbbd0c98c71a4803f818f98749929 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 16:58:09 -0800 Subject: [PATCH 0121/1212] f2fs: fix trim_fs assignment This is a missing fix from upstream.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ab0fc88bfe17..f69ddd77558f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1675,14 +1675,19 @@ static const struct segment_allocation default_salloc_ops = { bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; + bool has_candidate = false; mutex_lock(&SIT_I(sbi)->sentry_lock); - for (; trim_start <= cpc->trim_end; trim_start++) - if (add_discard_addrs(sbi, cpc, true)) + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; break; + } + } mutex_unlock(&SIT_I(sbi)->sentry_lock); - return trim_start <= cpc->trim_end; + cpc->trim_start = trim_start; + return has_candidate; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) From 4e50b7053c191969b664d447068113990a2e7ca9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 3 Feb 2017 17:44:04 -0800 Subject: [PATCH 0122/1212] f2fs: check io submission more precisely This patch checks IO submission more precisely than the previous rough check. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 23 +++++++++++++++++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 27 +++++++++++++++++++-------- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8a0bab49f0d..7e1b93dff2bb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -379,6 +379,9 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ?
fio->encrypted_page : fio->page; + /* set submitted = 1 as a return value */ + fio->submitted = 1; + if (!is_read) inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -1343,8 +1346,8 @@ int do_write_data_page(struct f2fs_io_info *fio) return err; } -static int __write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1362,6 +1365,7 @@ static int __write_data_page(struct page *page, .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, + .submitted = false, }; trace_f2fs_writepage(page, DATA); @@ -1427,13 +1431,19 @@ static int __write_data_page(struct page *page, if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; @@ -1448,7 +1458,7 @@ static int __write_data_page(struct page *page, static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, wbc); + return __write_data_page(page, NULL, wbc); } /* @@ -1507,6 +1517,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1540,7 +1551,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, wbc); + ret = __write_data_page(page, &submitted, wbc); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1554,7 +1565,7 @@ static int 
f2fs_write_cache_pages(struct address_space *mapping, done_index = page->index + 1; done = 1; break; - } else { + } else if (submitted) { nwritten++; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0c22dfd69d6e..b51c4a36bf50 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -795,6 +795,7 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + bool submitted; /* indicate IO submission */ }; #define is_read_io(rw) (rw == READ) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d24bdb970a24..4b9e116b781e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1318,7 +1318,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) return last_page; } -static int __write_node_page(struct page *page, bool atomic, +static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -1331,6 +1331,7 @@ static int __write_node_page(struct page *page, bool atomic, .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, + .submitted = false, }; trace_f2fs_writepage(page, NODE); @@ -1372,13 +1373,19 @@ static int __write_node_page(struct page *page, bool atomic, dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); - if (wbc->for_reclaim) + if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + submitted = NULL; + } unlock_page(page); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, NODE, WRITE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; return 0; @@ -1390,7 +1397,7 @@ static int __write_node_page(struct page *page, bool atomic, static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, wbc); + return __write_node_page(page, false, NULL, 
wbc); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1424,6 +1431,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); @@ -1473,12 +1481,13 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, goto continue_unlock; ret = __write_node_page(page, atomic && - page == last_page, wbc); + page == last_page, + &submitted, wbc); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } else { + } else if (submitted) { nwritten++; } @@ -1534,6 +1543,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1587,9 +1597,10 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, wbc); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) From 5b7c84083345af3007d71394037cb2cec31e26f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 1 Feb 2017 16:51:22 -0800 Subject: [PATCH 0123/1212] f2fs: check last page index in cached bio to decide submission If the cached bio has the last page's index, then we need to submit it. Otherwise, we don't need to submit it and can wait for further IO merges. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/data.c | 43 ++++++++++++++++++++++--------------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/node.c | 12 +++++++----- fs/f2fs/segment.c | 13 +++++++------ 5 files changed, 40 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 45ef3b6bfb04..c943452098a3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -250,7 +250,8 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, + 0, page->index, META, WRITE); unlock_page(page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7e1b93dff2bb..b362b2ce3b3a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -243,8 +243,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -253,7 +253,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -263,10 +263,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -275,22 +276,21 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page 
*page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; bool ret; down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); + ret = __has_merged_page(io, inode, ino, idx); up_read(&io->io_rwsem); return ret; } static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -299,7 +299,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) + if (!__has_merged_page(io, inode, ino, idx)) goto out; /* change META to META_FLUSH in the checkpoint procedure */ @@ -318,15 +318,15 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, int rw) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + __f2fs_submit_merged_bio(sbi, NULL, 0, 0, type, rw); } void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + if (has_merged_page(sbi, inode, ino, idx, type)) + __f2fs_submit_merged_bio(sbi, inode, ino, idx, type, rw); } void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) @@ -1429,7 +1429,8 @@ static int __write_data_page(struct page *page, bool *submitted, ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, + DATA, WRITE); 
remove_dirty_inode(inode); submitted = NULL; } @@ -1477,10 +1478,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); @@ -1566,7 +1567,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, done = 1; break; } else if (submitted) { - nwritten++; + last_idx = page->index; } if (--wbc->nr_to_write <= 0 && @@ -1588,9 +1589,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) + if (last_idx != ULONG_MAX) f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + 0, last_idx, DATA, WRITE); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b51c4a36bf50..840a37c20566 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2300,8 +2300,8 @@ void destroy_checkpoint_caches(void); void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, int rw); void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw); + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw); void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_submit_page_mbio(struct f2fs_io_info *fio); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b9e116b781e..86ff0da6d6aa 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1374,7 +1374,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, 0, + page->index, NODE, WRITE); submitted = NULL; } @@ 
-1404,12 +1405,12 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1488,7 +1489,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_put_page(last_page, 0); break; } else if (submitted) { - nwritten++; + last_idx = page->index; } if (page == last_page) { @@ -1514,8 +1515,9 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, goto retry; } out: - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_bio_cond(sbi, NULL, ino, last_idx, + NODE, WRITE); return ret ? -EIO: 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f69ddd77558f..ddada895787f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -263,7 +263,7 @@ static int __commit_inmem_pages(struct inode *inode, .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .encrypted_page = NULL, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -289,15 +289,15 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_bio_cond(sbi, inode, 0, last_idx, + DATA, WRITE); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -2011,7 +2011,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - 
f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, + 0, page->index, type, WRITE); if (ordered) wait_on_page_writeback(page); else From 1b1f1ea0e7f750d746f75b110978f3ebab22b0c4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 6 Feb 2017 13:57:58 -0800 Subject: [PATCH 0124/1212] f2fs: remove preflush for nobarrier case This patch removes REQ_PREFLUSH in the nobarrier case. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b362b2ce3b3a..e8f605bafc4e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -306,9 +306,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_FUA; + io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); out: From 194fbd2a5710c4a9446f52b6a4e144b0925cb8ea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Feb 2017 13:25:35 -0800 Subject: [PATCH 0125/1212] f2fs: show checkpoint version at mount time If we mounted f2fs successfully, let's show current checkpoint version. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 171ca84c7769..6ab7f6aa337c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2103,6 +2103,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->valid_super_block ? 
1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; From 56cba038b6226df91b11a557fdf2c3492b92c054 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Sat, 11 Feb 2017 15:50:46 +0530 Subject: [PATCH 0126/1212] f2fs: super: constify fscrypt_operations structure Declare fscrypt_operations structure as const as it is only stored in the s_cop field of a super_block structure. This field is of type const, so fscrypt_operations structure having this property can be made const too. File size before: fs/f2fs/super.o text data bss dec hex filename 54131 31355 184 85670 14ea6 fs/f2fs/super.o File size after: fs/f2fs/super.o text data bss dec hex filename 54227 31259 184 85670 14ea6 fs/f2fs/super.o Signed-off-by: Bhumika Goyal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6ab7f6aa337c..ebad846ba1f1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1210,7 +1210,7 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .get_context = f2fs_get_context, .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, @@ -1219,7 +1219,7 @@ static struct fscrypt_operations f2fs_cryptops = { .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif From a71c22fcd5c85453e5d362d2cddcd0fb45230630 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Feb 2017 17:39:45 +0800 Subject: [PATCH 0127/1212] f2fs: change recovery policy of xattr node block Currently, if we call fsync after updating the xattr data belonging to the file, f2fs needs to
trigger a checkpoint to keep xattr data consistent. But this policy causes low performance, as a checkpoint will block most foreground operations and cause unneeded and unrelated IOs around the checkpoint. This patch reuses the regular file recovery policy for the xattr node block: we now write an xattr node block tagged with the fsync flag to the warm area instead of the cold area, and during recovery we search the warm node chain for the fsynced xattr block and do the recovery. So, for the application IO pattern below, performance can be improved noticeably: - touch file - create/update/delete xattr entry in file - fsync file Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/file.c | 3 --- fs/f2fs/node.c | 29 +++++++++++++++++------------ fs/f2fs/node.h | 2 +- fs/f2fs/recovery.c | 8 +++----- fs/f2fs/xattr.c | 2 -- 6 files changed, 22 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 840a37c20566..848f24e40cdb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -527,7 +527,6 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ struct list_head dirty_list; /* dirty list for dirs and files */ @@ -2200,7 +2199,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void recover_inline_xattr(struct inode *inode, struct page *page); -void recover_xattr_data(struct inode *inode, struct page *page, +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr); int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int restore_node_summary(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 12c12cb4a06f..948d440dd3ce 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -143,8 +143,6 @@ static inline bool need_do_checkpoint(struct inode *inode) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; else if (test_opt(sbi, FASTBOOT)) need_cp = true; else if (sbi->active_logs == 2) @@ -170,7 +168,6 @@ static void try_to_fix_pino(struct inode *inode) nid_t pino; down_write(&fi->i_sem); - fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 86ff0da6d6aa..f8abf61be75b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -971,9 +971,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -2057,18 +2054,18 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; nid_t new_xnid = nid_of_node(page); struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); @@ -2076,19 +2073,27 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ + /* 2: 
update xattr nid in inode */ + remove_free_nid(sbi, new_xnid); + f2fs_i_xnid_write(inode, new_xnid); if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); + update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); + if (!xpage) + return -ENOMEM; + + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); - remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 29ff783eb9c3..d3d289306469 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -358,7 +358,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e93316ea8d1b..d025aa83fb5b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -378,11 +378,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. 
- */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ba67ca0c7014..8eca9022bf16 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -505,8 +505,6 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } From 69a0a6912f7882f3ddec5b1f5559871fddd1a05c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 11 Feb 2017 10:46:44 -0800 Subject: [PATCH 0128/1212] f2fs: remove build_free_nids() during checkpoint Let's avoid build_free_nids() in checkpoint path. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c943452098a3..fd8db9d1ceea 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1000,8 +1000,6 @@ static int block_operations(struct f2fs_sb_info *sbi) static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi, false); f2fs_unlock_all(sbi); } From 34a65412384c9e4f0126b95e7e4c1ce818623703 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Feb 2017 17:02:44 -0800 Subject: [PATCH 0129/1212] f2fs: avoid reading NAT page by get_node_info We've not seen this buggy case for a long time, so it's time to avoid this unnecessary get_node_info() call which reading NAT page to cache nat entry. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f8abf61be75b..8137903c9012 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1028,7 +1028,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1043,13 +1043,15 @@ struct page *new_node_page(struct dnode_of_data *dn, err = -ENOSPC; goto fail; } - - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); From 79887aed377059190c10d76a55674e6d189614f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 15 Feb 2017 10:34:45 +0800 Subject: [PATCH 0130/1212] f2fs: introduce noinline_xattr mount option This patch introduces new mount option 'noinline_xattr', so we can disable inline xattr functionality which is already set as a default mount option. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 1 + fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index d99faced79cb..8e454b0559f1 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -125,6 +125,7 @@ active_logs=%u Support configuring the number of active logs. 
In the disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ebad846ba1f1..7b3fe81db741 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -89,6 +89,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -122,6 +123,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -444,6 +446,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -457,6 +462,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -909,6 +918,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) From 675dd8213b7074093662cd83519ce4ed5d78bca4 Mon Sep 17 00:00:00 2001 From: Chao Yu 
Date: Wed, 8 Feb 2017 17:39:44 +0800 Subject: [PATCH 0131/1212] f2fs: enable inline_xattr by default In Android, since SELinux is enabled, a security policy is applied to each file and stored in the inode as an xattr entry, so it takes one additional 4k node block for each file. Let's enable inline_xattr by default in order to save storage space. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7b3fe81db741..35e712bbccf1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1040,6 +1040,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); From 5331a1d87fc592190b4eb7cbe6031ea1c6e6a70c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 14 Feb 2017 19:32:51 -0800 Subject: [PATCH 0132/1212] f2fs: use SSR for warm node as well We have had node chains, but haven't used it so far due to stale node blocks. Now, we have crc|cp_ver in node footer and give random cp_ver at format time, we can start to use it again.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ddada895787f..3c39982bd4b8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1642,7 +1642,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); From ff3bf2f2079260ccd00a2847f244dd0acb6c67b1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 15 Feb 2017 11:14:06 -0800 Subject: [PATCH 0133/1212] f2fs: show actual device info in tracepoints This patch shows actual device information in the tracepoints. Signed-off-by: Jaegeuk Kim Conflicts: include/trace/events/f2fs.h --- fs/f2fs/segment.c | 10 ++++---- include/trace/events/f2fs.h | 49 ++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3c39982bd4b8..4fc23afc03e2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -426,11 +426,11 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) int ret = __submit_flush_wait(sbi->sb->s_bdev); int i; - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - if (sbi->s_ndevs && !ret) { for (i = 1; i < sbi->s_ndevs; i++) { + trace_f2fs_issue_flush(FDEV(i).bdev, + test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); ret = __submit_flush_wait(FDEV(i).bdev); if (ret) break; @@ -839,7 +839,7 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, block_t lblkstart = blkstart; int err; - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + trace_f2fs_issue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); 
@@ -894,7 +894,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 217691582dd4..bd1772879c8c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -6,8 +6,8 @@ #include -#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev) -#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino +#define show_dev(dev) MAJOR(dev), MINOR(dev) +#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); @@ -239,7 +239,7 @@ TRACE_EVENT(f2fs_sync_fs, ), TP_printk("dev = (%d,%d), superblock is %s, wait = %d", - show_dev(__entry), + show_dev(__entry->dev), __entry->dirty ? 
"dirty" : "not dirty", __entry->wait) ); @@ -538,7 +538,7 @@ TRACE_EVENT(f2fs_background_gc, ), TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) @@ -580,7 +580,7 @@ TRACE_EVENT(f2fs_get_victim, TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u " "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u", - show_dev(__entry), + show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), @@ -717,7 +717,7 @@ TRACE_EVENT(f2fs_reserve_new_blocks, ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", - show_dev(__entry), + show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) @@ -787,6 +787,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_STRUCT__entry( __field(dev_t, dev) + __field(dev_t, target) __field(int, op) __field(int, op_flags) __field(int, type) @@ -796,6 +797,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_fast_assign( __entry->dev = sb->s_dev; + __entry->target = bio->bi_bdev->bd_dev; __entry->op = bio_op(bio); __entry->op_flags = bio->bi_rw; __entry->type = type; @@ -803,8 +805,9 @@ DECLARE_EVENT_CLASS(f2fs__bio, __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u", + show_dev(__entry->target), + show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, @@ -1101,16 +1104,16 @@ TRACE_EVENT(f2fs_write_checkpoint, ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", - show_dev(__entry), + show_dev(__entry->dev), show_cpreason(__entry->reason), __entry->msg) ); TRACE_EVENT(f2fs_issue_discard, - TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + 
TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), - TP_ARGS(sb, blkstart, blklen), + TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) @@ -1119,22 +1122,22 @@ TRACE_EVENT(f2fs_issue_discard, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); TRACE_EVENT(f2fs_issue_reset_zone, - TP_PROTO(struct super_block *sb, block_t blkstart), + TP_PROTO(struct block_device *dev, block_t blkstart), - TP_ARGS(sb, blkstart), + TP_ARGS(dev, blkstart), TP_STRUCT__entry( __field(dev_t, dev) @@ -1142,21 +1145,21 @@ TRACE_EVENT(f2fs_issue_reset_zone, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; ), TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart) ); TRACE_EVENT(f2fs_issue_flush, - TP_PROTO(struct super_block *sb, unsigned int nobarrier, + TP_PROTO(struct block_device *dev, unsigned int nobarrier, unsigned int flush_merge), - TP_ARGS(sb, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge), TP_STRUCT__entry( __field(dev_t, dev) @@ -1165,13 +1168,13 @@ TRACE_EVENT(f2fs_issue_flush, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; ), TP_printk("dev = (%d,%d), %s %s", - show_dev(__entry), + show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", __entry->flush_merge ? 
" with flush_merge" : "") ); @@ -1286,7 +1289,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt) ); @@ -1333,7 +1336,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", - show_dev(__entry), + show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); From e5e16d8af5cb4936cb2438e4961769878583c76c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 14 Feb 2017 09:54:37 -0800 Subject: [PATCH 0134/1212] f2fs: fix multiple f2fs_add_link() calls having same name It turns out a stakable filesystem like sdcardfs in AOSP can trigger multiple vfs_create() to lower filesystem. In that case, f2fs will add multiple dentries having same name which breaks filesystem consistency. Until upper layer fixes, let's work around by f2fs, which shows actually not much performance regression. Cc: Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 34 +++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 1 + 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4436079dbf0c..ab5343f79f9b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -207,9 +207,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - if (!de && room && F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; + /* This is to increase the speed of f2fs_create */ + if (!de && room) { + F2FS_I(dir)->task = current; + if (F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; + } } return de; @@ -643,14 +647,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = 
fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 848f24e40cdb..24b5fd2d1f03 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -526,6 +526,7 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ From 4c05cfb0081f3fca7de0a7d9d458305e655dd82b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 17 Feb 2017 17:16:38 +0800 Subject: [PATCH 0135/1212] f2fs: replace __get_victim by dirty_segments in FG_GC In FG_GC process, it will search victim section twice. This will cause some dirty section with less valid blocks skip garbage collection. 
section # 26425 : valid blocks # 3 142.037567: get_victim_by_default: victim 26425 : valid blocks # 3 142.037585: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 142.039494: f2fs_get_victim: dev = (259,30), type = Hot DATA, policy = (Background GC, SSR-mode, Greedy), victim = 19022 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 24 142.070247: new_curseg: Debug: alloc new segment 26746 142.244341: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26054 ofs_unit = 1, pre_victim_secno = 26054, prefree = 0, free = 243 142.254475: do_garbage_collect: Debug: FG_GC, seg_freed = 1 142.293131: f2fs_get_victim: dev = (259,30), type = Warm DATA, policy = (Background GC, SSR-mode, Greedy), victim = 23466 ofs_unit = 1, pre_victim_secno = -1, prefree = 0, free = 244 142.319001: f2fs_get_victim: dev = (259,30), type = Warm DATA, policy = (Background GC, SSR-mode, Greedy), victim = 23467 ofs_unit = 1, pre_victim_secno = -1, prefree = 0, free = 244 142.368879: get_victim_by_default: victim 26425 : valid blocks # 3 142.368894: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 142.378127: f2fs_get_victim: dev = (259,30), type = Hot DATA, policy = (Background GC, SSR-mode, Greedy), victim = 19612 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 24 142.416917: new_curseg: Debug: alloc new segment 26054 142.656794: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 25404 ofs_unit = 1, pre_victim_secno = 25404, prefree = 0, free = 243 142.662139: do_garbage_collect: Debug: FG_GC, seg_freed = 1 142.684159: new_curseg: Debug: alloc new segment 25197 142.685059: get_victim_by_default: victim 26425 : valid blocks # 3 
142.685079: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 243 142.701427: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26238 ofs_unit = 1, pre_victim_secno = 26238, prefree = 0, free = 243 142.707105: do_garbage_collect: Debug: FG_GC, seg_freed = 1 142.802444: f2fs_get_victim: dev = (259,30), type = Warm DATA, policy = (Background GC, SSR-mode, Greedy), victim = 23473 ofs_unit = 1, pre_victim_secno = -1, prefree = 0, free = 244 142.804422: get_victim_by_default: victim 26425 : valid blocks # 3 142.804443: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 142.851567: f2fs_get_victim: dev = (259,30), type = Hot DATA, policy = (Background GC, SSR-mode, Greedy), victim = 19092 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 24 142.865014: new_curseg: Debug: alloc new segment 26238 143.082245: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26307 ofs_unit = 1, pre_victim_secno = 26307, prefree = 0, free = 244 143.088252: do_garbage_collect: Debug: FG_GC, seg_freed = 1 143.128307: new_curseg: Debug: alloc new segment 25404 143.181846: get_victim_by_default: victim 26425 : valid blocks # 3 143.181872: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7f0c3e02408c..518557bfad42 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -927,8 +927,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool 
background) cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (unlikely(f2fs_cp_error(sbi))) { @@ -943,12 +941,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) * enough free sections, we should flush dent/node blocks and do * garbage collections. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { + if (dirty_segments(sbi) || prefree_segments(sbi)) { ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; - segno = NULL_SEGNO; } else if (has_not_enough_free_secs(sbi, 0, 0)) { ret = write_checkpoint(sbi, &cpc); if (ret) @@ -959,7 +955,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) goto stop; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) + if (!__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; From e24eb1fceda8df42f824bd51bfc28e8019848c5b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 17 Feb 2017 09:55:55 -0800 Subject: [PATCH 0136/1212] f2fs: do not wait for writeback in write_begin Otherwise we can get livelock like below. [79880.428136] dbench D 0 18405 18404 0x00000000 [79880.428139] Call Trace: [79880.428142] __schedule+0x219/0x6b0 [79880.428144] schedule+0x36/0x80 [79880.428147] schedule_timeout+0x243/0x2e0 [79880.428152] ? update_sd_lb_stats+0x16b/0x5f0 [79880.428155] ? ktime_get+0x3c/0xb0 [79880.428157] io_schedule_timeout+0xa6/0x110 [79880.428161] __lock_page+0xf7/0x130 [79880.428164] ? unlock_page+0x30/0x30 [79880.428167] pagecache_get_page+0x16b/0x250 [79880.428171] grab_cache_page_write_begin+0x20/0x40 [79880.428182] f2fs_write_begin+0xa2/0xdb0 [f2fs] [79880.428192] ? f2fs_mark_inode_dirty_sync+0x16/0x30 [f2fs] [79880.428197] ? kmem_cache_free+0x79/0x200 [79880.428203] ? __mark_inode_dirty+0x17f/0x360 [79880.428206] generic_perform_write+0xbb/0x190 [79880.428213] ? 
file_update_time+0xa4/0xf0 [79880.428217] __generic_file_write_iter+0x19b/0x1e0 [79880.428226] f2fs_file_write_iter+0x9c/0x180 [f2fs] [79880.428231] __vfs_write+0xc5/0x140 [79880.428235] vfs_write+0xb2/0x1b0 [79880.428238] SyS_write+0x46/0xa0 [79880.428242] entry_SYSCALL_64_fastpath+0x1e/0xad Fixes: cae96a5c8ab6 ("f2fs: check io submission more precisely") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e8f605bafc4e..2efca57fdb26 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1756,7 +1756,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = grab_cache_page(mapping, index); if (!page) { err = -ENOMEM; goto fail; From c1288c8f35c7dde04b857b17bc7437907f19cfac Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 16 Feb 2017 12:34:31 +0000 Subject: [PATCH 0137/1212] f2fs: add ovp valid_blocks check for bg gc victim to fg_gc For foreground gc, the greedy algorithm should be adopted, which makes this formula work well: (2 * (100 / config.overprovision + 1) + 6) But currently, fg_gc gives priority to bg_gc victim segments and collects them first. These victims are selected by the cost-benefit algorithm, so we can't guarantee that such segments have few valid blocks, which may break the f2fs rule and, in the worst case, consume all the free segments. This patch fixes this by adding a filter in check_bg_victims: if a segment has a number of valid blocks over the overprovision ratio, skip it.
Cc: Signed-off-by: Hou Pengyang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/gc.c | 22 ++++++++++++++++++++-- fs/f2fs/segment.h | 9 +++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 24b5fd2d1f03..053f5b30eb4b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -957,6 +957,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 518557bfad42..6cb0c81f56a5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -166,7 +166,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; @@ -199,6 +200,10 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } @@ -322,13 +327,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -985,5 +992,16 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) void 
build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count, blocks_per_sec; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; + + sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec, + (main_count - resv_count)); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5cb5755c75d9..f4020f141d83 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -716,6 +716,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) From 1466b660bebe2334aad1f806c793be57f028f17d Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 21 Feb 2017 16:59:26 +0800 Subject: [PATCH 0138/1212] f2fs: put allocate_segment after refresh_sit_entry SIT information should be updated before segment allocation, since SSR needs latest valid block information. Current code does not update the old_blkaddr info in sit_entry, so adjust the allocate_segment to its proper location. Commit 5e443818fa0b2a2845561ee25bec181424fb2889 ("f2fs: handle dirty segments inside refresh_sit_entry") puts it into wrong location. 
Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4fc23afc03e2..22e9c31e189f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1838,14 +1838,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); + mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) From 316bed49a61855f454fee133d244bd51812055d1 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Wed, 22 Feb 2017 10:28:59 +0000 Subject: [PATCH 0139/1212] f2fs: node segment is prior to data segment selected victim As data segment GC may make the dnode dirty, the greedy cost for a data segment should be valid blocks * 2; that is, a node segment is prior to a data segment. Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6cb0c81f56a5..07e61b6139cc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -242,6 +242,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, sbi->segs_per_sec); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
+ valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -250,7 +260,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } From e3e27c59487bf43b9b5a8bfc1d825e5720443332 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 22 Feb 2017 20:50:49 +0800 Subject: [PATCH 0140/1212] f2fs: do SSR for data when there is enough free space In allocate_segment_by_default(), need_SSR() already detected it's time to do SSR. So, let's try to find victims for data segments more aggressively in time. Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 22e9c31e189f..98eef04bffa3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1619,7 +1619,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) + if (IS_NODESEG(type)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); From 62d8564f71ec326e7cb5ba2ab582942858ccccec Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 16:39:11 -0800 Subject: [PATCH 0141/1212] f2fs: do SSR in higher priority Let's check SSR in prior to LFS allocation. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 98eef04bffa3..209fe59e45f4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1403,17 +1403,6 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int segno = curseg->segno + 1; - struct free_segmap_info *free_i = FREE_I(sbi); - - if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) - return !test_bit(segno, free_i->free_segmap); - return 0; -} - /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1638,21 +1627,17 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) - new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, curseg); + stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); } void allocate_new_segments(struct f2fs_sb_info *sbi) From 3f2523b222146a4af5b0c0c65e82e641475a3c66 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 17:10:18 -0800 Subject: [PATCH 0142/1212] f2fs: find data segments across all the types Previously, if type is CURSEG_HOT_DATA, we only check CURSEG_HOT_DATA only. This patch fixes to search all the different types for SSR. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 209fe59e45f4..aebbc3dbc2de 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1607,16 +1607,23 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + int i; + + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + return 1; if (IS_NODESEG(type)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + return 0; /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + if (i == type) + continue; if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + BG_GC, i, SSR)) return 1; + } return 0; } From 91ef1346c8a399e90c9e3611914dd21f20172029 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 19:10:35 -0800 Subject: [PATCH 0143/1212] f2fs: avoid very large discard command This patch adds MAX_DISCARD_BLOCKS() to avoid issuing too much large single discard command. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 053f5b30eb4b..121ee0765fa4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -200,7 +200,8 @@ enum { (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) - +#define MAX_DISCARD_BLOCKS(sbi) \ + ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) #define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aebbc3dbc2de..63846d45b4ad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -965,7 +965,8 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, if (!list_empty(head)) { last = list_last_entry(head, struct discard_entry, list); if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { + last->blkaddr + last->len && + last->len < MAX_DISCARD_BLOCKS(sbi)) { last->len += end - start; goto done; } From 94d3f18b4f784e2ee114db3ad773527df992ac27 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 19:53:07 -0800 Subject: [PATCH 0144/1212] f2fs: much larger batched trim_fs job We have a kernel thread to issue discard commands, so we can increase the number of batched discard sections. By default, now it becomes 4GB range. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 121ee0765fa4..4440f52a83a3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -195,7 +195,7 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ From 97a61adb90fb6d15e46b693a7a60b5bde6b1c8d2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 19:58:23 -0800 Subject: [PATCH 0145/1212] f2fs: wait for discard completion after submission We don't need to wait for each discard commands when unmounting the image. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 63846d45b4ad..609d09ab6012 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -676,8 +676,12 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = &(dcc->discard_cmd_list); struct discard_cmd *dc, *tmp; + struct blk_plug plug; mutex_lock(&dcc->cmd_lock); + + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, wait_list, list) { if (blkaddr == NULL_ADDR) { @@ -686,9 +690,6 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) submit_bio(REQ_SYNC, dc->bio); atomic_inc(&dcc->submit_discard); } - wait_for_completion_io(&dc->wait); - - __remove_discard_cmd(sbi, dc); continue; } @@ -699,6 +700,15 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) __remove_discard_cmd(sbi, dc); } } + blk_finish_plug(&plug); + + /* this comes from f2fs_put_super */ + if (blkaddr == NULL_ADDR) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } + } 
mutex_unlock(&dcc->cmd_lock); } From 70dd5a4c5aa2440965085a36a17496be6ba9b760 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 20:18:35 -0800 Subject: [PATCH 0146/1212] f2fs: check discard alignment only for SEQWRITE zones For conventional zones, we don't need to align discard commands to exact zone size. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 609d09ab6012..d77b2cddf9df 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -872,24 +872,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - sector_t sector; + sector_t sector, nr_sects; int devi = 0; if (sbi->s_ndevs) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - sector = SECTOR_FROM_BLOCK(blkstart); - - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { - f2fs_msg(sbi->sb, KERN_INFO, - "(%d) %s: Unaligned discard attempted (block %x + %x)", - devi, sbi->s_ndevs ? FDEV(devi).path: "", - blkstart, blklen); - return -EIO; - } /* * We need to know the type of the zone: for conventional zones, @@ -904,6 +893,17 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ?
FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); From cb9ca08d121ab076d027d3f37806fd315d9771fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 17:02:32 -0800 Subject: [PATCH 0147/1212] f2fs: do SSR for node segments more aggresively This patch gives more SSR chances for node blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d77b2cddf9df..934749663b61 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1618,17 +1618,22 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - int i; + int i, n; /* need_SSR() already forces to do this */ if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) return 1; - if (IS_NODESEG(type)) - return 0; + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + i = CURSEG_HOT_NODE; + n = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_DATA; + n = CURSEG_COLD_DATA; + } - /* For data segments, let's do SSR more intensively */ - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + for (; i <= n; i++) { if (i == type) continue; if (v_ops->get_victim(sbi, &(curseg)->next_segno, From c074e1b7c11cb877b8484f8f47ff618ff83b5169 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 21 Feb 2017 20:43:48 +0800 Subject: [PATCH 0148/1212] f2fs: remove unnecessary condition check for write_checkpoint in f2fs_gc Since has_not_enough_free_secs(sbi, 0, 0) must be true if has_not_enough_ free_secs(sbi, sec_freed, 0) is true, write_checkpoint is sure to execute in both conditions. 
Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 07e61b6139cc..2727d352817e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -958,15 +958,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) * enough free sections, we should flush dent/node blocks and do * garbage collections. */ - if (dirty_segments(sbi) || prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - } + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; } else if (gc_type == BG_GC && !background) { /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ goto stop; From c12a69d920f191a77d8f5134592af800b965bd21 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 23 Feb 2017 09:18:05 +0000 Subject: [PATCH 0149/1212] f2fs: init local extent_info to avoid stale stack info in tp To avoid such stale(fops, blk, len) info in f2fs_lookup_extent_tree_end tp dio-23095 [005] ...1 17878.856859: f2fs_lookup_extent_tree_end: dev = (259,30), ino = 856, pgofs = 0, ext_info(fofs: 3441207040, blk: 4294967232, len: 3481143808) Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- fs/f2fs/file.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2efca57fdb26..8cbfd1caf4be 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -511,7 +511,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -528,7 +528,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = 
inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -803,7 +803,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -1664,7 +1664,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 948d440dd3ce..36082c11adb7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1896,7 +1896,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; From 7234370dc65c163b804acf700d9694ce7d78abdf Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 23 Feb 2017 09:18:06 +0000 Subject: [PATCH 0150/1212] f2fs: remove unsafe bitmap checking proc A: proc B: - writeback_sb_inodes - __writeback_single_inode - do_writepages - f2fs_write_node_pages - f2fs_balance_fs_bg - write_checkpoint - build_free_nids - flush_nat_entries - __build_free_nids - __flush_nat_entry_set - ra_meta_pages - get_next_nat_page - current_nat_addr - set_to_next_nat [do nat_bitmap checking] - f2fs_change_bit For proc A, nat_bitmap and nat_bitmap_mir would be compared without lock_op and nm_i->nat_tree_lock, while proc B is changing nat_bitmap/nat_bitmap_ver in cp. So it is normal for nat_bitmap/nat_bitmap_mir difference under such scenario. This patch fixes this by removing the monitoring point.
[Fix: 599a09b f2fs: check in-memory nat version bitmap] Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index d3d289306469..3fc9c4b1dce9 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -209,12 +209,6 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) (seg_off << sbi->log_blocks_per_seg << 1) + (block_off & (sbi->blocks_per_seg - 1))); -#ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != - f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) - f2fs_bug_on(sbi, 1); -#endif - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; From 8cfbfea08204d8eb71ff5306c859379456e75bd3 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 23 Feb 2017 19:55:05 +0800 Subject: [PATCH 0151/1212] f2fs: avoid m_flags overlay when allocating more data blocks When more than one data blocks are allocated, the F2FS_MAP_UNWRITTEN/MAPPED flags will be overlapped by F2FS_MAP_NEW at the later times. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8cbfd1caf4be..5356cc3cf1a9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -867,7 +867,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { From 2ef086ad2991b52a5bf8a24408505bc53d6e43f8 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 23 Feb 2017 19:39:59 +0800 Subject: [PATCH 0152/1212] f2fs: replace rw semaphore extent_tree_lock with mutex lock This patch replace rw semaphore extent_tree_lock with mutex lock for no read cases with this lock. 
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 22 +++++++++++----------- fs/f2fs/f2fs.h | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6ed6424807b6..0ab5518e45c2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -77,7 +77,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +94,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -548,7 +548,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -570,11 +570,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. 
remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -604,7 +604,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -651,10 +651,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -662,12 +662,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -714,7 +714,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4440f52a83a3..94f0dcf48763 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -911,7 +911,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache 
extent cache entries */ - struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ From 273924c37731e8a4e578f31727167338a237d1dd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Feb 2017 10:38:09 -0800 Subject: [PATCH 0153/1212] f2fs: add bitmaps for empty or full NAT blocks This patch adds bitmaps to represent empty or full NAT blocks containing free nid entries. If we can find valid crc|cp_ver in the last block of checkpoint pack, we'll use these bitmaps when building free nids. In order to avoid checkpointing burden, up-to-date bitmaps will be flushed only during umount time. So, normally we can get this gain, but when power-cut happens, we rely on fsck.f2fs which recovers this bitmap again. After this patch, we build free nids from nid #0 at mount time to make more full NAT blocks, but in runtime, we check empty NAT blocks to load free nids without loading any NAT pages from disk.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 28 +++++- fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 31 ++++++- fs/f2fs/node.c | 188 ++++++++++++++++++++++++++++++++++++---- fs/f2fs/segment.c | 2 +- include/linux/f2fs_fs.h | 1 + 6 files changed, 231 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fd8db9d1ceea..2a7824341a75 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1025,6 +1025,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); + if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason == CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else @@ -1137,6 +1141,28 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + unsigned int i; + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + + /* Flush all the NAT BITS pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + sync_meta_pages(sbi, META, LONG_MAX); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + } + } + /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) @@ -1273,7 +1299,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi); + flush_nat_entries(sbi, cpc); flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ diff --git a/fs/f2fs/debug.c 
b/fs/f2fs/debug.c index de8da9fc5c99..015ad2b73a92 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -193,6 +193,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); get_cache: si->cache_mem = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 94f0dcf48763..3e726878ccdc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,6 +621,7 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -631,6 +632,11 @@ struct f2fs_nm_info { /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif @@ -1238,6 +1244,27 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock(&sbi->cp_lock); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? 
(cpc->reason == CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2198,7 +2225,7 @@ void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); -void build_free_nids(struct f2fs_sb_info *sbi, bool sync); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); @@ -2209,7 +2236,7 @@ int recover_xattr_data(struct inode *inode, struct page *page, int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void flush_nat_entries(struct f2fs_sb_info *sbi); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int build_node_manager(struct f2fs_sb_info *sbi); void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8137903c9012..7facc1711baf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -338,6 +338,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); + if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR) + __clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); + /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) e = __lookup_nat_cache(nm_i, ni->ino); @@ -1844,7 +1847,60 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static int scan_nat_bits(struct f2fs_sb_info *sbi) +{ + 
struct f2fs_nm_info *nm_i = NM_I(sbi); + struct page *page; + unsigned int i = 0; + nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; + nid_t nid; + + if (!enabled_nat_bits(sbi, NULL)) + return -EAGAIN; + + down_read(&nm_i->nat_tree_lock); +check_empty: + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) { + i = 0; + goto check_partial; + } + + for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK; + nid++) { + if (unlikely(nid >= nm_i->max_nid)) + break; + add_free_nid(sbi, nid, true); + } + + if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + goto out; + i++; + goto check_empty; + +check_partial: + i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) { + disable_nat_bits(sbi, true); + up_read(&nm_i->nat_tree_lock); + return -EINVAL; + } + + nid = i * NAT_ENTRY_PER_BLOCK; + page = get_current_nat_page(sbi, nid); + scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + + if (nm_i->nid_cnt[FREE_NID_LIST] < target) { + i++; + goto check_partial; + } +out: + up_read(&nm_i->nat_tree_lock); + return 0; +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1859,6 +1915,21 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; + /* try to find free nids with nat_bits */ + if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) + return; + + /* find next valid candidate */ + if (enabled_nat_bits(sbi, NULL)) { + int idx = find_next_zero_bit_le(nm_i->full_nat_bits, + nm_i->nat_blocks, 0); + + if (idx >= nm_i->nat_blocks) + set_sbi_flag(sbi, SBI_NEED_FSCK); + else + nid = idx * NAT_ENTRY_PER_BLOCK; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1901,10 +1972,10 @@ static 
void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync); + __build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1946,7 +2017,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true); + build_free_nids(sbi, true, false); goto retry; } @@ -2238,8 +2309,39 @@ static void __adjust_nat_entry_set(struct nat_entry_set *nes, list_add_tail(&nes->set_list, head); } +void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2254,7 +2356,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. 
*/ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2294,10 +2397,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } f2fs_bug_on(sbi, set->entry_cnt); @@ -2308,7 +2413,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2329,7 +2434,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (cpc->reason == CP_UMOUNT || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2343,27 +2449,72 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (crc << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + static int init_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *sb_raw 
= F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - @@ -2397,6 +2548,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + err = __get_nat_bitmaps(sbi); + if (err) + return err; + #ifdef CONFIG_F2FS_CHECK_FS nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); @@ -2419,7 +2574,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi, true); + build_free_nids(sbi, true, true); return 0; } @@ -2478,6 +2633,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS kfree(nm_i->nat_bitmap_mir); #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 934749663b61..953599361fb0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -386,7 +386,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false); + build_free_nids(sbi, false, false); if (!is_idle(sbi)) return; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index f0748524ca8c..1c92ace2e8f8 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,7 @@ struct 
f2fs_super_block { /* * For checkpoint */ +#define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 From 1ad1cd4f71491ca811a0511bfdc8287a686d1244 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 23 Feb 2017 20:31:20 +0800 Subject: [PATCH 0154/1212] f2fs: no need lock_op in f2fs_write_inline_data Similar as f2fs_write_inode, f2fs_write_inline_data just mark inode page dirty, so it's no need to write inline data under read lock of cp_rwsem. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5356cc3cf1a9..86774b13ba42 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1411,9 +1411,12 @@ static int __write_data_page(struct page *page, bool *submitted, goto redirty_out; err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); + if (!err) + goto out; + } + f2fs_lock_op(sbi); if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) From 7d77c7a3525b5be2b5daaaf23ed71c7632f9b04a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 17:43:49 -0800 Subject: [PATCH 0155/1212] f2fs: use __clear_bit_le Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ab5343f79f9b..0c7bd9a133a9 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -745,7 +745,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = 
find_next_bit_le(&dentry_blk->dentry_bitmap, From 2ea010a9c82622e475a00e20ee53b095edc53c3e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Feb 2017 15:09:16 -0800 Subject: [PATCH 0156/1212] fscrypt: catch fscrypto_get_policy in v4.10-rc6 Signed-off-by: Jaegeuk Kim --- fs/crypto/policy.c | 39 ++++++++++++++++++++++++++------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/file.c | 19 ++----------------- include/linux/fscrypto.h | 12 ++++++------ 4 files changed, 36 insertions(+), 38 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..69ec4da11a7b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp, ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); + &policy); } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { + &policy)) { printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", __func__); @@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct 
fscrypt_context ctx; + struct fscrypt_policy policy; int res; if (!inode->i_sb->s_cop->get_context || @@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -171,6 +179,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) BUG_ON(1); } + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + /* no restrictions if the parent directory is not encrypted */ if (!parent->i_sb->s_cop->is_encrypted(parent)) return 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3e726878ccdc..1e41664982a0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2708,8 +2708,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy 
fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 36082c11adb7..892caab7f74b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1772,31 +1772,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index ff8b11b26f31..e6e53a36104b 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -250,8 +250,8 @@ extern void fscrypt_restore_control_page(struct page *); extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); -extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern 
int fscrypt_ioctl_get_policy(struct file *, void __user *); extern int fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, void *, bool); @@ -320,14 +320,14 @@ static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct file *f, - const struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f, + const void __user *arg) { return -EOPNOTSUPP; } -static inline int fscrypt_notsupp_get_policy(struct inode *i, - struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f, + void __user *arg) { return -EOPNOTSUPP; } From 3a40c74cce8bf6a05114d70317fe4c2c8b6ca50d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Feb 2017 11:08:28 +0800 Subject: [PATCH 0157/1212] f2fs: show simple call stack in fault injection message Previously kernel message can show that in which function we do the injection, but unfortunately, most of the caller are the same, for tracking more information of injection path, it needs to show upper caller's name. This patch supports that ability. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 4 +++- fs/f2fs/dir.c | 4 +++- fs/f2fs/f2fs.h | 20 +++++++++++++------- fs/f2fs/gc.c | 4 +++- fs/f2fs/inode.c | 4 +++- fs/f2fs/node.c | 4 +++- fs/f2fs/segment.c | 4 +++- 8 files changed, 32 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2a7824341a75..d30973a4e4d9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -495,6 +495,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 86774b13ba42..dacc9b2896d7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -55,8 +55,10 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 0c7bd9a133a9..35cbe7185594 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -549,8 +549,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1e41664982a0..c8ddca99acaa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1019,6 +1019,10 @@ struct f2fs_sb_info { }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : 
inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -1032,10 +1036,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -1344,8 +1344,10 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t diff; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); return false; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1585,8 +1587,10 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -2062,8 +2066,10 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2727d352817e..b77d1c806aba 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -48,8 +48,10 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); 
f2fs_stop_checkpoint(sbi, false); + } #endif /* diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index af06bda51a54..24bb8213d974 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -373,8 +373,10 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); goto no_delete; + } #endif remove_ino_entry(sbi, inode->i_ino, APPEND_INO); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7facc1711baf..dca0b1a2c395 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1990,8 +1990,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif spin_lock(&nm_i->nid_list_lock); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 953599361fb0..684a5165dd04 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -352,8 +352,10 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif if (!need) From 377816fec3f305ab5f4f72356363bfa98b992db5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 24 Feb 2017 18:46:00 +0800 Subject: [PATCH 0158/1212] f2fs: select target segment with closer temperature in SSR mode In SSR mode, we can allocate target segment which has different temperature type from the type of current block, in order to avoid mixing coldest and hottest data/node as much as possible, change SSR allocation policy to select closer temperature for current block prior. 
Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 684a5165dd04..e4ef306ba234 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1620,7 +1620,8 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - int i, n; + int i, cnt; + bool reversed = false; /* need_SSR() already forces to do this */ if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) @@ -1628,14 +1629,24 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(type)) { - i = CURSEG_HOT_NODE; - n = CURSEG_COLD_NODE; + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; } else { - i = CURSEG_HOT_DATA; - n = CURSEG_COLD_DATA; + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; } - for (; i <= n; i++) { + for (; cnt-- > 0; reversed ? 
i-- : i++) { if (i == type) continue; if (v_ops->get_victim(sbi, &(curseg)->next_segno, From 1b30dde97f843a00bd6c19c3bd6fe501a4487b8f Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Sat, 25 Feb 2017 03:57:38 +0000 Subject: [PATCH 0159/1212] f2fs: avoid bggc->fggc when enough free segments are avaliable after cp We use has_not_enough_free_secs to check if there are enough free segments, (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + needed); Under scenario with large number of dirty nodes, these nodes would be flushed during cp, as a result, right side of the inequality would be decreased, while left side stays unchanged if these nodes are flushed in SSR way, which means there are enough free segments after this cp. For this case, we just do a bggc instead of fggc. Signed-off-by: Hou Pengyang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b77d1c806aba..8c8e7135ef58 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -953,21 +953,22 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. */ ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; - } else if (gc_type == BG_GC && !background) { - /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - goto stop; + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; } + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (gc_type == BG_GC && !background) + goto stop; if (!__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; From e042b87adaa7be0bdda3998f07cc3777fc845a71 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Feb 2017 17:29:54 +0800 Subject: [PATCH 0160/1212] f2fs: kill __is_extent_same Since commit ee6d182f2a19 ("f2fs: remove syncing inode page in all the cases") delayed inode element updating from inode cache to node page cache, so once largest cached extent is updated, we can make inode dirty immediately instead of checking and updating it in the end of extent cache update. The above commit didn't clean up unneeded codes in extent_cache.c, let's finish the job in this patch. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 8 +++----- fs/f2fs/f2fs.h | 7 ------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 0ab5518e45c2..c6934f014e0f 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -413,7 +413,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -426,7 +426,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -434,7 +434,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -531,8 +531,6 @@ static unsigned int 
f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c8ddca99acaa..b4bde555aaf4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -563,13 +563,6 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) -{ - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); -} - static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { From 2ed4b498d823d54e6c1d8e22390624f814b4b753 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:23:27 +0800 Subject: [PATCH 0161/1212] f2fs: Don't update the xattr data that same as the exist f2fs removes the old xattr data and appends the new data although the new data is same as the exist. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8eca9022bf16..4f8ab3c0edb1 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -585,6 +585,13 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + return (entry->e_value_size == size) && !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -619,12 +626,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 
0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; From aa2a9a1c479bd0225c4f7bbd78957f7f9a5f133c Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:23:40 +0800 Subject: [PATCH 0162/1212] f2fs: drop the duplicate pval in f2fs_getxattr Fixes: ba38c27eb9 ("f2fs: enhance lookup xattr") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 4f8ab3c0edb1..d0d15920e3ff 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -514,7 +514,6 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, struct f2fs_xattr_entry *entry = NULL; int error = 0; unsigned int size, len; - char *pval; void *base_addr = NULL; if (name == NULL) @@ -536,8 +535,6 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, goto out; } - pval = entry->e_name + entry->e_name_len; - if (buffer) { char *pval = entry->e_name + entry->e_name_len; memcpy(buffer, pval, size); From 20adb5b3fe0bfde2d36413de4bed7963a3158184 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:32:21 +0800 Subject: [PATCH 0163/1212] f2fs: update the comment of default nr_pages to skipping Fixes: 2c237ebaa4 ("f2fs: avoid writing node/metapages during writes") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f4020f141d83..5e8ad4280a50 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -736,8 +736,8 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) * It is very important to 
gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { From 4db9ebac249f0882d4ace06b77b2e9dc8aa440df Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:53:39 +0800 Subject: [PATCH 0164/1212] f2fs: new helper cur_cp_crc() getting crc in f2fs_checkpoint There are four places that getting the crc value in f2fs_checkpoint, just add a new helper cur_cp_crc for them. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/node.c | 5 +---- fs/f2fs/node.h | 20 +++++++------------- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d30973a4e4d9..645c3f7f21ce 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -684,8 +684,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, return -EINVAL; } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block - + crc_offset))); + crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b4bde555aaf4..d0f8a6153068 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1193,6 +1193,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned 
int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dca0b1a2c395..3463a3e54750 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2465,9 +2465,6 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; unsigned int i; __u64 cp_ver = cur_cp_version(ckpt); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); block_t nat_bits_addr; if (!enabled_nat_bits(sbi, NULL)) @@ -2490,7 +2487,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } - cp_ver |= (crc << 32); + cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { disable_nat_bits(sbi, true); return 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3fc9c4b1dce9..2f9603fa85a5 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -300,14 +300,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ -315,14 +312,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if 
(__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + return cp_ver == cpver_of_node(page); } From 179e2535c7b7231285a9db9a14b663191acce80e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Feb 2017 10:53:49 +0800 Subject: [PATCH 0165/1212] f2fs: introduce free nid bitmap In the scenario of intensive node allocation, free nids will soon run out, and we then need to stop and load free nids by traversing NAT blocks; in the worst case, if the NAT blocks are not cached in memory, this generates IOs which slow down our foreground operations. In order to speed up node allocation, in this patch we introduce a new free_nid_bitmap array, so there is a bitmap table for each NAT block. Once the NAT block is loaded, the related bitmap cache will be switched on, and the bitmap will be set while traversing nat entries in the NAT block; later we can query and update nid usage status entirely in memory. With such an implementation, I expect the performance of node allocation can be improved in the long term after the filesystem image is mounted.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: include/linux/f2fs_fs.h --- fs/f2fs/debug.c | 2 + fs/f2fs/f2fs.h | 2 + fs/f2fs/node.c | 125 ++++++++++++++++++++++++++++++++++++---- include/linux/f2fs_fs.h | 1 + 4 files changed, 120 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 015ad2b73a92..a77df377e2e8 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -194,6 +194,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; get_cache: si->cache_mem = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0f8a6153068..f26b9b451e13 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -622,6 +622,8 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3463a3e54750..2909c935039a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1768,7 +1768,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -1777,14 +1778,14 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; + return false; if (build) { 
/* do not add allocated nids */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; + return false; } i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); @@ -1793,7 +1794,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (radix_tree_preload(GFP_NOFS)) { kmem_cache_free(free_nid_slab, i); - return 0; + return true; } spin_lock(&nm_i->nid_list_lock); @@ -1802,9 +1803,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) radix_tree_preload_end(); if (err) { kmem_cache_free(free_nid_slab, i); - return 0; + return true; } - return 1; + return true; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1825,17 +1826,36 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; @@ -1843,10 +1863,54 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) - add_free_nid(sbi, 
start_nid, true); + freed = add_free_nid(sbi, start_nid, true); + update_free_nid_bitmap(sbi, start_nid, freed); } } +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + unsigned int target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); +} + static int scan_nat_bits(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1915,9 +1979,17 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; - /* try to find free nids with nat_bits */ - if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) - return; + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + + /* try to find free nids with nat_bits */ + if (!scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) + return; + } /* find next valid candidate */ if (enabled_nat_bits(sbi, NULL)) { @@ -2013,6 +2085,9 @@ bool alloc_nid(struct f2fs_sb_info *sbi, 
nid_t *nid) i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false); + spin_unlock(&nm_i->nid_list_lock); return true; } @@ -2067,6 +2142,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; + update_free_nid_bitmap(sbi, nid, true); + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2395,6 +2472,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2561,6 +2643,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } +int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + return 0; +} + int build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2573,6 +2671,10 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; + err = init_free_nid_cache(sbi); + if (err) + return err; + build_free_nids(sbi, true, true); return 0; } @@ -2631,6 +2733,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kfree(nm_i->nat_bitmap); kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 1c92ace2e8f8..e2d239ed4c60 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -279,6 +279,7 @@ struct 
f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ From b1305bba60c0a2bf2aeb2c16777f2fbe0d46f282 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sun, 26 Feb 2017 20:47:16 +0800 Subject: [PATCH 0166/1212] f2fs: use MAX_FREE_NIDS for the free nids target F2FS has define MAX_FREE_NIDS for maximum of cached free nids target. Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2909c935039a..cbc0d6ca58da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1874,7 +1874,6 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; unsigned int i, idx; - unsigned int target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; down_read(&nm_i->nat_tree_lock); @@ -1890,7 +1889,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); - if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) goto out; } } @@ -1916,7 +1915,6 @@ static int scan_nat_bits(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct page *page; unsigned int i = 0; - nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; nid_t nid; if (!enabled_nat_bits(sbi, NULL)) @@ -1937,7 +1935,7 @@ static int scan_nat_bits(struct f2fs_sb_info *sbi) add_free_nid(sbi, nid, true); } - if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) goto out; i++; goto check_empty; @@ -1955,7 +1953,7 @@ static int scan_nat_bits(struct f2fs_sb_info *sbi) scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); - if 
(nm_i->nid_cnt[FREE_NID_LIST] < target) { + if (nm_i->nid_cnt[FREE_NID_LIST] < MAX_FREE_NIDS) { i++; goto check_partial; } From 6ac7367ebfee241a3df925859e6dd55dd89da2fc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Feb 2017 17:10:45 +0800 Subject: [PATCH 0167/1212] f2fs: fix to update F2FS_{CP_}WB_DATA count correctly We should only account F2FS_{CP_}WB_DATA IOs for write path, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dacc9b2896d7..58e7dcb9af62 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -398,7 +398,8 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { err = -EAGAIN; - dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + if (!is_read) + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, From 9113aae794eb25d3d74c5589b49c091727a9d78b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Feb 2017 18:43:12 +0800 Subject: [PATCH 0168/1212] f2fs: fix memory leak of write_io_dummy mempool during umount Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 35e712bbccf1..da9592d88edb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -818,7 +818,8 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); - + if (sbi->write_io_dummy) + mempool_destroy(sbi->write_io_dummy); destroy_percpu_info(sbi); kfree(sbi); } From f6493d7dd1cd7c6bf764fa3d4417a9562e9601b5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Feb 2017 18:43:13 +0800 Subject: [PATCH 0169/1212] f2fs: fix to enlarge size of write_io_dummy mempool It needs to double cache size of write_io_dummy mempool, otherwise we may run out of cache in 
the scenario where Data/Node IOs are issued concurrently. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index da9592d88edb..379259ce4cd1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1931,7 +1931,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = - mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); if (!sbi->write_io_dummy) goto free_options; } From e323e9ef9f41168ae00ba25c6560aabf9e706b1f Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 27 Feb 2017 18:59:53 +0800 Subject: [PATCH 0170/1212] f2fs: remove redundant set_page_dirty() This patch removes the redundant set_page_dirty in truncate_blocks Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 892caab7f74b..cfd86ae20b7c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -568,8 +568,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); + truncate_inline_inode(ipage, from); if (from == 0) clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); From 5086fe4c101b9c5a7cca87754fc6b4a17101f9ff Mon Sep 17 00:00:00 2001 From: Masato Suzuki Date: Mon, 27 Feb 2017 20:52:49 +0900 Subject: [PATCH 0171/1212] f2fs: Fix zoned block device support The introduction of the multi-device feature partially broke the support for zoned block devices. In the function f2fs_scan_devices, sbi->devs allocation and initialization is skipped in the case of a single device mount. This results in no device information structure being allocated for the device.
This is fine if the device is a regular device, but in the case of a zoned block device, the device zone type array is not initialized, which causes the function __f2fs_issue_discard_zone to fail as get_blkz_type is unable to determine the zone type of a section. Fix this by always allocating and initializing the sbi->devs device information array even in the case of a single device if that device is zoned. For this particular case, make sure to obtain a reference on the single device so that the call to blkdev_put() in destroy_device_list operates as expected. Fixes: 3c62be17d4f562f4 ("f2fs: support multiple devices") Cc: # v4.10 Signed-off-by: Masato Suzuki Acked-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 71 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 379259ce4cd1..b7f8932c3502 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1745,36 +1745,59 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; int i; - for (i = 0; i < MAX_DEVICES; i++) { - if (!RDEV(i).path[0]) + /* Initialize single device information */ + if (!RDEV(0).path[0]) { +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(sbi->sb->s_bdev) == BLK_ZONED_NONE) return 0; + max_devices = 1; +#else + return 0; +#endif + } - if (i == 0) { - sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * - MAX_DEVICES, GFP_KERNEL); - if (!sbi->devs) - return -ENOMEM; - } + /* + * Initialize multiple devices information, or single + * zoned block device information. 
+ */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; - memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); - FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); - if (i == 0) { - FDEV(i).start_blk = 0; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); - } else { - FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; - } + for (i = 0; i < max_devices; i++) { - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -1794,6 +1817,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) "Failed to initialize F2FS blkzone information"); return -EINVAL; } + if (max_devices == 1) + break; f2fs_msg(sbi->sb, KERN_INFO, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", i, FDEV(i).path, From c1c90b7d9dc1440665b1cb0ba069013a5b483464 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Mon, 27 Feb 2017 13:02:58 +0000 Subject: [PATCH 0172/1212] f2fs: add f2fs_drop_inode 
tracepoint Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- include/trace/events/f2fs.h | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b7f8932c3502..85c282272067 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -624,6 +624,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -656,10 +657,12 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index bd1772879c8c..b95872b9c3ae 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -309,6 +309,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_ARGS(inode, ret) ); +DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, + + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret) +); + DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), From 95bfba756e531d42d1b06b45b12e3e0f65560aee Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Mon, 27 Feb 2017 13:02:59 +0000 Subject: [PATCH 0173/1212] f2fs: fix a plint compile warning fix such pclint warning: ... Loss of precision (arg. no. 
2) (unsigned long long to unsigned int)) Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8c8e7135ef58..68d6a4cad349 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1009,6 +1009,6 @@ void build_gc_manager(struct f2fs_sb_info *sbi) ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec, + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, (main_count - resv_count)); } From 7375ae65fa6dae808669b6837b49ca40fe227531 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 27 Feb 2017 11:57:11 -0800 Subject: [PATCH 0174/1212] f2fs: avoid to issue redundant discard commands If segs_per_sec is over 1 like under SMR, previously f2fs issues discard commands redundantly on the same section, since we didn't move end position for the previous discard command. 
E.g., start end | | prefree_bitmap = [01111100111100] And, after issue discard for this section, end start | | prefree_bitmap = [01111100111100] Select this section again by searching from (end + 1), start end | | prefree_bitmap = [01111100111100] Fixes: 36abef4e796d38 ("f2fs: introduce mode=lfs mount option") Cc: Cc: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e4ef306ba234..a09c726cc1c3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1115,6 +1115,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); From 9cc3fbc9ea182cfd50758d754896108749f74808 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 27 Feb 2017 21:28:53 -0800 Subject: [PATCH 0175/1212] f2fs: avoid to flush nat journal entries This patch adds a missing condition which flushes nat journal entries unnecessarily introduced by: f2fs: add bitmaps for empty or full NAT blocks Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cbc0d6ca58da..81f0daad982b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2516,7 +2516,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (cpc->reason == CP_UMOUNT || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); From b5bb7b2de94dc8802a7eb27e668f7e4e122de209 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 8 Mar 2017 15:24:43 -0800 Subject: [PATCH 0176/1212] fscrypt: catch up to v4.11-rc1 fscrypt: - fs/crypto/bio.c changes f2fs: - fscrypt: use ENOKEY when file cannot be created w/o key - fscrypt: split supp and notsupp declarations into their own headers - fscrypt: make fscrypt_operations.key_prefix a string Signed-off-by: Jaegeuk Kim --- fs/crypto/Makefile | 1 + fs/crypto/bio.c | 143 ++++++++++++++++++ fs/crypto/crypto.c | 249 +++++++++++--------------------- fs/crypto/fname.c | 12 +- fs/crypto/fscrypt_private.h | 117 +++++++++++++++ fs/crypto/keyinfo.c | 52 +++---- fs/crypto/policy.c | 97 +++++-------- fs/f2fs/data.c | 4 +- fs/f2fs/dir.c | 5 +- fs/f2fs/f2fs.h | 40 +---- fs/f2fs/namei.c | 4 +- fs/f2fs/super.c | 14 +- include/linux/fscrypt_common.h | 146 +++++++++++++++++++ include/linux/fscrypt_notsupp.h | 168 +++++++++++++++++++++ include/linux/fscrypt_supp.h | 66 +++++++++ include/uapi/linux/fs.h | 15 ++ 16 files changed, 818 insertions(+), 315 deletions(-) create mode 100644 fs/crypto/bio.c create mode 100644 fs/crypto/fscrypt_private.h create mode 100644 include/linux/fscrypt_common.h create mode 100644 include/linux/fscrypt_notsupp.h create mode 100644 include/linux/fscrypt_supp.h diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..a91ed46fe503 --- /dev/null +++ b/fs/crypto/bio.c @@ -0,0 +1,143 @@ +/* + * This contains encryption functions for per-file encryption. 
+ * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. 
*/ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! 
*/ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(0, bio); + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2d40ab9edc9f..02a7a9286449 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,10 +24,9 @@ #include #include #include -#include #include #include -#include +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -44,7 +43,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -63,7 +62,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +87,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. 
*/ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +120,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -141,15 +140,10 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; @@ -162,6 +156,8 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -175,14 +171,14 @@ static int do_page_crypto(struct inode *inode, page_crypt_complete, &ecr); BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); + xts_tweak.index = cpu_to_le64(lblk_num); memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); if (rw == 
FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -202,53 +198,86 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. 
Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +294,13 @@ struct page *fscrypt_encrypt_page(struct inode *inode, EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. 
+ * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,75 +308,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never 
happen! */ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -399,63 +375,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. 
*/ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; @@ -481,17 +400,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -520,7 +444,6 @@ int fscrypt_initialize(void) mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. 
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..13052b85c393 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include #include -#include +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); @@ -332,7 +332,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_crypt_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; @@ -367,7 +367,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..ea01e5279675 --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,117 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. 
+ * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include + +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. 
+ */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } + +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +/* crypto.c */ +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); + +/* keyinfo.c */ +extern int fscrypt_get_crypt_info(struct inode *); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..02eb6b9e4438 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,7 @@ #include #include -#include +#include "fscrypt_private.h" static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -77,26 +77,22 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix) { - u8 *full_key_descriptor; + char *description; struct key 
*keyring_key; struct fscrypt_key *master_key; const struct user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); @@ -178,7 +174,7 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +int fscrypt_get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,7 +184,7 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; @@ -206,12 +202,15 @@ int get_crypt_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ 
-247,20 +246,10 @@ int get_crypt_info(struct inode *inode) if (!raw_key) goto out; - if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -269,7 +258,6 @@ int get_crypt_info(struct inode *inode) } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; @@ -327,7 +315,7 @@ int fscrypt_get_encryption_info(struct inode *inode) (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); + return fscrypt_get_crypt_info(inode); return 0; } EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 69ec4da11a7b..14b76da71269 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,40 +10,23 @@ #include #include -#include #include - -static int inode_has_encryption_context(struct inode *inode) -{ - if (!inode->i_sb->s_cop->get_context) - return 0; - return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); -} +#include "fscrypt_private.h" /* - * check whether the policy is consistent with the encryption context - * for the inode + * check whether an encryption policy is consistent with an encryption context */ -static int is_encryption_context_consistent_with_policy(struct inode *inode, +static bool 
is_encryption_context_consistent_with_policy( + const struct fscrypt_context *ctx, const struct fscrypt_policy *policy) { - struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->get_context) - return 0; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); + return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx->flags == policy->flags) && + (ctx->contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx->filenames_encryption_mode == + policy->filenames_encryption_mode); } static int create_encryption_context_from_policy(struct inode *inode, @@ -66,20 +49,12 @@ static int create_encryption_context_from_policy(struct inode *inode, FS_KEY_DESCRIPTOR_SIZE); if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); + policy->contents_encryption_mode)) return -EINVAL; - } if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); + policy->filenames_encryption_mode)) return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -98,6 +73,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; if (copy_from_user(&policy, arg, sizeof(policy))) return -EFAULT; @@ -114,9 +90,10 @@ int fscrypt_ioctl_set_policy(struct file *filp, 
const void __user *arg) inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir) ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) @@ -124,12 +101,14 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) else ret = create_encryption_context_from_policy(inode, &policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - &policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + } else if (ret == sizeof(ctx) && + is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; } inode_unlock(inode); @@ -151,8 +130,10 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; @@ -217,9 +198,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. 
- * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -240,19 +221,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 58e7dcb9af62..bda784e38407 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1309,7 +1309,9 @@ int do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); + PAGE_SIZE, 0, + fio->page->index, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); if (err == -ENOMEM) { diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 35cbe7185594..4e2153620a3b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -272,7 +272,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, 
&fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f26b9b451e13..fa463ef1ccc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,7 +22,11 @@ #include #include #include -#include +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include +#else +#include +#endif #include #include @@ -854,10 +858,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -865,11 +865,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif - #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ @@ -2704,29 +2699,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) return 0; #endif } - -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy -#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context 
fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index db3079cd665d..a5a9ffc8e358 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -400,7 +400,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -444,7 +444,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 85c282272067..291b92a486d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1206,12 +1206,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -1227,8 +1221,8 @@ static unsigned f2fs_max_namelen(struct inode *inode) } static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = 
f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, @@ -1568,12 +1562,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) mutex_init(&sbi->wio_mutex[NODE]); mutex_init(&sbi->wio_mutex[DATA]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h new file mode 100644 index 000000000000..547f81592ba1 --- /dev/null +++ b/include/linux/fscrypt_common.h @@ -0,0 +1,146 @@ +/* + * fscrypt_common.h: common declarations for per-file encryption + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#ifndef _LINUX_FSCRYPT_COMMON_H +#define _LINUX_FSCRYPT_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_CRYPTO_BLOCK_SIZE 16 + +struct fscrypt_info; + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. 
+ */ +static inline u32 fscrypt_symlink_data_len(u32 l) +{ + if (l < FS_CRYPTO_BLOCK_SIZE) + l = FS_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct fscrypt_symlink_data) - 1); +} + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto operations for filesystems + */ +struct fscrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*prepare_context)(struct inode *); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); +} + +static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' 
&& str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(const struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +#endif /* _LINUX_FSCRYPT_COMMON_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h new file mode 100644 index 000000000000..3511ca798804 --- /dev/null +++ b/include/linux/fscrypt_notsupp.h @@ -0,0 +1,168 @@ +/* + * fscrypt_notsupp.h + * + * This stubs out the fscrypt functions for filesystems configured without + * encryption support. + */ + +#ifndef _LINUX_FSCRYPT_NOTSUPP_H +#define _LINUX_FSCRYPT_NOTSUPP_H + +#include + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, + gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + return; +} + +static inline struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_decrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num) +{ + return -EOPNOTSUPP; +} + + +static inline void fscrypt_restore_control_page(struct page *page) +{ + return; +} + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + return; +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + return; +} + +/* policy.c */ +static inline int fscrypt_ioctl_set_policy(struct file *filp, + const void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int 
fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_has_permitted_context(struct inode *parent, + struct inode *child) +{ + return 0; +} + +static inline int fscrypt_inherit_context(struct inode *parent, + struct inode *child, + void *fs_data, bool preload) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_get_encryption_info(struct inode *inode) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_put_encryption_info(struct inode *inode, + struct fscrypt_info *ci) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, + u32 ilen) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, + u32 ilen, + struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) +{ + return; +} + +static inline int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +/* bio.c */ +static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, + struct bio *bio) +{ + return; +} + +static inline void fscrypt_pullback_bio_page(struct page **page, bool 
restore) +{ + return; +} + +static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + return -EOPNOTSUPP; +} + +#endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h new file mode 100644 index 000000000000..a140f47e9b27 --- /dev/null +++ b/include/linux/fscrypt_supp.h @@ -0,0 +1,66 @@ +/* + * fscrypt_supp.h + * + * This is included by filesystems configured with encryption support. + */ + +#ifndef _LINUX_FSCRYPT_SUPP_H +#define _LINUX_FSCRYPT_SUPP_H + +#include + +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, + unsigned int, unsigned int, + u64, gfp_t); +extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int, + unsigned int, u64); +extern void fscrypt_restore_control_page(struct page *); + +extern const struct dentry_operations fscrypt_d_ops; + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + d_set_d_op(dentry, &fscrypt_d_ops); +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +} + +/* policy.c */ +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, 
struct fscrypt_name *); +extern void fscrypt_free_filename(struct fscrypt_name *); +extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); + +/* bio.c */ +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, + unsigned int); + +#endif /* _LINUX_FSCRYPT_SUPP_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index c8c093e8c83d..ea33e08d9d75 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -176,6 +176,21 @@ struct inodes_stat_t { /* Policy provided via an ioctl on the topmost directory */ #define FS_KEY_DESCRIPTOR_SIZE 8 +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + + struct fscrypt_policy { __u8 version; __u8 contents_encryption_mode; From 9fa38a0c6456093028506d1577e29ed6246751a7 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 28 Feb 2017 20:32:41 +0800 Subject: [PATCH 0177/1212] f2fs: fix an error return value in truncate_partial_data_page This patch fix a error return value in truncate_partial_data_page Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cfd86ae20b7c..11053141ee4f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -530,7 +530,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); From 9b86801f59e4ea6375c49d5a316aaa8b8bb76efe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 4 Mar 2017 13:56:10 -0800 Subject: [PATCH 0178/1212] f2fs: don't need to invalidate wrong node page If f2fs_new_inode() fails, the bad inode invalidates the 0'th node page during f2fs_evict_inode(), which is unnecessary. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 24bb8213d974..ef8610bf950f 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -411,7 +411,10 @@ void f2fs_evict_inode(struct inode *inode) stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { From 4b056f06acfac3a7a5124c5eb1c1ee43807b3367 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 6 Mar 2017 11:59:56 -0800 Subject: [PATCH 0179/1212] f2fs: don't overwrite node block by SSR This patch fixes an issue where SSR can overwrite a previous warm node block that is part of a node chain since the last checkpoint.
Fixes: 5b6c6be2d878 ("f2fs: use SSR for warm node as well") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a09c726cc1c3..de30f4a86219 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1242,6 +1242,12 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { #ifdef CONFIG_F2FS_CHECK_FS From d95038cf6b1eaaa2b9dc9086563663180f7defb1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 10 Mar 2017 09:36:10 -0800 Subject: [PATCH 0180/1212] f2fs: le16_to_cpu for xattr->e_value_size This patch fixes missing le16 conversion, reported by kbuild test robot. 
Fixes: 5f35a2cd5 ("f2fs: Don't update the xattr data that same as the exist") Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index d0d15920e3ff..fb5062a4df77 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -586,7 +586,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, const void *value, size_t size) { void *pval = entry->e_name + entry->e_name_len; - return (entry->e_value_size == size) && !memcmp(pval, value, size); + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); } static int __f2fs_setxattr(struct inode *inode, int index, From 48da6d86af950c6e29538b5579046b361d00e876 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 10 Mar 2017 15:25:59 +0800 Subject: [PATCH 0181/1212] f2fs: __update_nat_bits() can be static Signed-off-by: Fengguang Wu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 81f0daad982b..286d015aab8e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2386,7 +2386,7 @@ static void __adjust_nat_entry_set(struct nat_entry_set *nes, list_add_tail(&nes->set_list, head); } -void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); From f0135c1551e03aa50c702b4e6caa722eec472082 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 10 Mar 2017 15:54:31 +0800 Subject: [PATCH 0182/1212] f2fs: update_free_nid_bitmap() can be static Signed-off-by: Fengguang Wu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 286d015aab8e..ae4711d17f5f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1826,7 +1826,7 @@ static void remove_free_nid(struct 
f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -2641,7 +2641,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } -int init_free_nid_cache(struct f2fs_sb_info *sbi) +static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); From 7ae846e99ba2d099949eccba2dd48421960e1619 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 14:11:06 -0800 Subject: [PATCH 0183/1212] f2fs: use __set{__clear}_bit_le This patch uses __set{__clear}_bit_le for higher speed. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ae4711d17f5f..3a441d84643a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1836,9 +1836,9 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set return; if (set) - set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); else - clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1850,7 +1850,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; - set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; From 7abdfbd622db16c0a7f222b17601d8a5816d981a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Mar 2017 17:09:07 +0800 Subject: [PATCH 0184/1212] f2fs: skip scanning free nid bitmap of full NAT blocks This patch adds to account free nids for each NAT blocks, and while
scanning all free nid bitmap, do check count and skip lookuping in full NAT block. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/node.c --- fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 2 ++ fs/f2fs/node.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a77df377e2e8..ee2d0a485fc3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -196,6 +196,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa463ef1ccc6..7c7ebf323255 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -628,6 +628,8 @@ struct f2fs_nm_info { struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ + spinlock_t free_nid_lock; /* protect updating of nid count */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3a441d84643a..0184ba3fbe94 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1826,7 +1826,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1839,6 +1840,13 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); else 
__clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + spin_lock(&nm_i->free_nid_lock); + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; + spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1850,6 +1858,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; @@ -1864,7 +1875,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed); + update_free_nid_bitmap(sbi, start_nid, freed, true); } } @@ -1880,6 +1891,8 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) continue; + if (!nm_i->free_nid_count[i]) + continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { nid_t nid; @@ -2084,7 +2097,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2140,7 +2153,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2470,11 +2483,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { 
spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2654,6 +2667,14 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; + + nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + + spin_lock_init(&nm_i->free_nid_lock); + return 0; } @@ -2733,6 +2754,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kvfree(nm_i->nat_block_bitmap); kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); kfree(nm_i->nat_bits); From e10680dfc72f2ff195201b7e348288af5c5b072a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Mar 2017 20:07:49 +0800 Subject: [PATCH 0185/1212] f2fs: combine nat_bits and free_nid_bitmap cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both the nat_bits cache and the free_nid_bitmap cache provide the same functionality as an intermediate cache between the free nid cache and disk, but with different granularity of indicating free nid range, and different persistence policy. nat_bits cache provides better persistence ability, and free_nid_bitmap provides better granularity. In this patch we combine the advantages of both caches, so the final policy of the intermediate cache would be: - init: load free nid status from nat_bits into free_nid_bitmap - lookup: scan free_nid_bitmap before loading NAT blocks - update: update free_nid_bitmap in real-time - persistence: update and persist nat_bits in checkpoint This patch also resolves a performance regression reported by lkp-robot.
commit: 4ac912427c4214d8031d9ad6fbc3bc75e71512df ("f2fs: introduce free nid bitmap") d00030cf9cd0bb96fdccc41e33d3c91dcbb672ba ("f2fs: use __set{__clear}_bit_le") 1382c0f3f9d3f936c8bc42ed1591cf7a593ef9f7 ("f2fs: combine nat_bits and free_nid_bitmap cache") 4ac912427c4214d8 d00030cf9cd0bb96fdccc41e33 1382c0f3f9d3f936c8bc42ed15 ---------------- -------------------------- -------------------------- %stddev %change %stddev %change %stddev \ | \ | \ 77863 ± 0% +2.1% 79485 ± 1% +50.8% 117404 ± 0% aim7.jobs-per-min 231.63 ± 0% -2.0% 227.01 ± 1% -33.6% 153.80 ± 0% aim7.time.elapsed_time 231.63 ± 0% -2.0% 227.01 ± 1% -33.6% 153.80 ± 0% aim7.time.elapsed_time.max 896604 ± 0% -0.8% 889221 ± 3% -20.2% 715260 ± 1% aim7.time.involuntary_context_switches 2394 ± 1% +4.6% 2503 ± 1% +3.7% 2481 ± 2% aim7.time.maximum_resident_set_size 6240 ± 0% -1.5% 6145 ± 1% -14.1% 5360 ± 1% aim7.time.system_time 1111357 ± 3% +1.9% 1132509 ± 2% -6.2% 1041932 ± 2% aim7.time.voluntary_context_switches ... Signed-off-by: Chao Yu Tested-by: Xiaolong Ye Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 125 +++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0184ba3fbe94..5c70f33a2b4c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -338,9 +338,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); - if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR) - __clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); - /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) e = __lookup_nat_cache(nm_i, ni->ino); @@ -1827,7 +1824,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build) + bool set, bool build, bool locked) { struct f2fs_nm_info *nm_i = NM_I(sbi); 
unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1841,12 +1838,14 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - spin_lock(&nm_i->free_nid_lock); + if (!locked) + spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - spin_unlock(&nm_i->free_nid_lock); + if (!locked) + spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1875,7 +1874,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true); + update_free_nid_bitmap(sbi, start_nid, freed, true, false); } } @@ -1923,58 +1922,6 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) up_read(&nm_i->nat_tree_lock); } -static int scan_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - unsigned int i = 0; - nid_t nid; - - if (!enabled_nat_bits(sbi, NULL)) - return -EAGAIN; - - down_read(&nm_i->nat_tree_lock); -check_empty: - i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); - if (i >= nm_i->nat_blocks) { - i = 0; - goto check_partial; - } - - for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK; - nid++) { - if (unlikely(nid >= nm_i->max_nid)) - break; - add_free_nid(sbi, nid, true); - } - - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) - goto out; - i++; - goto check_empty; - -check_partial: - i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); - if (i >= nm_i->nat_blocks) { - disable_nat_bits(sbi, true); - up_read(&nm_i->nat_tree_lock); - return -EINVAL; - } - - nid = i * NAT_ENTRY_PER_BLOCK; - page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); - - if (nm_i->nid_cnt[FREE_NID_LIST] < MAX_FREE_NIDS) { - 
i++; - goto check_partial; - } -out: - up_read(&nm_i->nat_tree_lock); - return 0; -} - static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1996,21 +1943,6 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) if (nm_i->nid_cnt[FREE_NID_LIST]) return; - - /* try to find free nids with nat_bits */ - if (!scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) - return; - } - - /* find next valid candidate */ - if (enabled_nat_bits(sbi, NULL)) { - int idx = find_next_zero_bit_le(nm_i->full_nat_bits, - nm_i->nat_blocks, 0); - - if (idx >= nm_i->nat_blocks) - set_sbi_flag(sbi, SBI_NEED_FSCK); - else - nid = idx * NAT_ENTRY_PER_BLOCK; } /* readahead nat pages to be scanned */ @@ -2097,7 +2029,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false, false); + update_free_nid_bitmap(sbi, *nid, false, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2153,7 +2085,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); + update_free_nid_bitmap(sbi, nid, true, false, false); spin_unlock(&nm_i->nid_list_lock); @@ -2483,11 +2415,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); + update_free_nid_bitmap(sbi, nid, true, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false); + update_free_nid_bitmap(sbi, nid, false, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2593,6 +2525,40 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) return 0; } +inline void load_free_nid_bitmap(struct f2fs_sb_info 
*sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&nm_i->free_nid_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true, true); + spin_unlock(&nm_i->free_nid_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } +} + static int init_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); @@ -2694,6 +2660,9 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + build_free_nids(sbi, true, true); return 0; } From 983979fd88cf2f79f8fd19d482f381a38385398b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 28 Feb 2017 20:32:41 +0800 Subject: [PATCH 0186/1212] f2fs: fix an error return value in truncate_partial_data_page This patch fix a error return value in truncate_partial_data_page Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim From ffcf47f5345bcc2be12259feb7b2285065ab7778 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 4 Mar 2017 13:56:10 -0800 Subject: [PATCH 0187/1212] f2fs: don't need to invalidate wrong node page If f2fs_new_inode() is failed, the bad inode will invalidate 0'th node page during f2fs_evict_inode(), which doesn't need to do. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim From a9fc7327a2cb1df55043ae8ace77e574bae4a8a7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 10 Mar 2017 09:36:10 -0800 Subject: [PATCH 0188/1212] f2fs: le16_to_cpu for xattr->e_value_size This patch fixes missing le16 conversion, reported by kbuild test robot. Fixes: 5f35a2cd5 ("f2fs: Don't update the xattr data that same as the exist") Reviewed-by: Kinglong Mee Signed-off-by: Jaegeuk Kim From bf5320b6fbd9d37a3776d6daaf0f60fd6c6cd451 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 11 Mar 2017 21:18:01 +0800 Subject: [PATCH 0189/1212] f2fs: le32_to_cpu for ckpt->cp_pack_total_block_count Fixes: 22ad0b6ab4 ("f2fs: add bitmaps for empty or full NAT blocks") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 645c3f7f21ce..08b9a1f578e3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1025,7 +1025,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); - if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + if (cpc->reason == CP_UMOUNT && + le32_to_cpu(ckpt->cp_pack_total_block_count) > sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); From 3100307fdbdd104f0c5d50fae206859f14e362a3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 13:32:20 -0800 Subject: [PATCH 0190/1212] f2fs: fix wrong error injection for evict_inode The previous one was not a proper location to inject an error, since there is no point to get errors. Instead, we can emulate EIO during truncation, and the below logic should handle it correctly. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ef8610bf950f..2520fa72b23f 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(FAULT_EVICT_INODE); - goto no_delete; - } -#endif - remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -389,6 +382,12 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); From 8f326468d5b9c0b640a7265d9d4684917caa1334 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 11:22:45 -0800 Subject: [PATCH 0191/1212] f2fs: don't allow to get pino when filename is encrypted After renaming an encrypted file, we have no way to get its encrypted filename from its dentry. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 11053141ee4f..38d39f656746 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -112,6 +112,9 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; + if (file_enc_name(inode)) + return 0; + inode = igrab(inode); dentry = d_find_any_alias(inode); iput(inode); From 78e31d26ec81b8bc65589bb5289fb6ced240d320 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 4 Mar 2017 21:48:28 +0800 Subject: [PATCH 0192/1212] f2fs: fix the fault of checking F2FS_LINK_MAX for rename inode The parent directory's nlink will change, not the inode. 
Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a5a9ffc8e358..65fff81889cf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -905,8 +905,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } From ef250a614506801289ffb8cb1e8e20e1a4e341ec Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Wed, 8 Mar 2017 09:49:53 +0800 Subject: [PATCH 0193/1212] f2fs: fix the fault of calculating blkstart twice When the zone type is BLK_ZONE_TYPE_CONVENTIONAL, the blkstart is calculated twice. 
Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index de30f4a86219..b914cfb49096 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -875,6 +875,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { @@ -892,7 +893,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __f2fs_issue_discard_async(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: sector = SECTOR_FROM_BLOCK(blkstart); From b39d14bd84c39a4771dc80cb0acda88236b812e2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 13:41:22 -0800 Subject: [PATCH 0194/1212] f2fs: build stat_info before orphan inode recovery f2fs_sync_fs() -> write_checkpoint() calls stat_inc_cp_count(sbi->stat_info), which needs stat_info allocation. Otherwise, we can hit: [254042.598623] ? count_shadow_nodes+0xa0/0xa0 [254042.598633] f2fs_sync_fs+0x65/0xd0 [f2fs] [254042.598645] f2fs_balance_fs_bg+0xe4/0x1c0 [f2fs] [254042.598657] f2fs_write_node_pages+0x34/0x1a0 [f2fs] [254042.598664] ? pagevec_lookup_entries+0x1e/0x30 [254042.598673] do_writepages+0x1e/0x30 [254042.598682] __writeback_single_inode+0x45/0x330 [254042.598688] writeback_single_inode+0xd7/0x190 [254042.598694] write_inode_now+0x86/0xa0 [254042.598699] iput+0x122/0x200 [254042.598709] f2fs_fill_super+0xd4a/0x14d0 [f2fs] [254042.598717] mount_bdev+0x184/0x1c0 [254042.598934] ? f2fs_commit_super+0x100/0x100 [f2fs] [254042.599142] f2fs_mount+0x15/0x20 [f2fs] [254042.599349] mount_fs+0x39/0x160 [254042.599554] ? 
__alloc_percpu+0x15/0x20 [254042.599759] vfs_kern_mount+0x67/0x110 [254042.599972] do_mount+0x1bb/0xc80 [254042.600175] ? memdup_user+0x42/0x60 [254042.600380] SyS_mount+0x83/0xd0 [254042.600583] entry_SYSCALL_64_fastpath+0x1e/0xad Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 291b92a486d5..b760414f3f9d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2028,6 +2028,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_join_shrinker(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) @@ -2052,10 +2056,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - err = f2fs_build_stats(sbi); - if (err) - goto free_root_inode; - if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -2149,7 +2149,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } - f2fs_destroy_stats(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2167,6 +2166,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: From 26012ec09c68b91929d33ad268a8e470c5df870e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 17 Mar 2017 09:55:52 +0800 Subject: [PATCH 0195/1212] f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer When I forced to enable atomic operations intentionally, I could hit the below panic, since we didn't clear page->private in f2fs_invalidate_page called by file truncation. The panic occurs due to NULL mapping having page->private. 
BUG: unable to handle kernel paging request at ffffffffffffffff IP: drop_buffers+0x38/0xe0 PGD 5d00c067 PUD 5d00e067 PMD 0 CPU: 3 PID: 1648 Comm: fsstress Tainted: G D OE 4.10.0+ #5 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 task: ffff9151952863c0 task.stack: ffffaaec40db4000 RIP: 0010:drop_buffers+0x38/0xe0 RSP: 0018:ffffaaec40db74c8 EFLAGS: 00010292 Call Trace: ? page_referenced+0x8b/0x170 try_to_free_buffers+0xc5/0xe0 try_to_release_page+0x49/0x50 shrink_page_list+0x8bc/0x9f0 shrink_inactive_list+0x1dd/0x500 ? shrink_active_list+0x2c0/0x430 shrink_node_memcg+0x5eb/0x7c0 shrink_node+0xe1/0x320 do_try_to_free_pages+0xef/0x2e0 try_to_free_pages+0xe9/0x190 __alloc_pages_slowpath+0x390/0xe70 __alloc_pages_nodemask+0x291/0x2b0 alloc_pages_current+0x95/0x140 __page_cache_alloc+0xc4/0xe0 pagecache_get_page+0xab/0x2a0 grab_cache_page_write_begin+0x20/0x40 get_read_data_page+0x2e6/0x4c0 [f2fs] ? f2fs_mark_inode_dirty_sync+0x16/0x30 [f2fs] ? truncate_data_blocks_range+0x238/0x2b0 [f2fs] get_lock_data_page+0x30/0x190 [f2fs] __exchange_data_block+0xaaf/0xf40 [f2fs] f2fs_fallocate+0x418/0xd00 [f2fs] vfs_fallocate+0x157/0x220 SyS_fallocate+0x48/0x80 Signed-off-by: Yunlei He Signed-off-by: Chao Yu [Chao Yu: use INMEM_INVALIDATE for better tracing] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 30 ++++++++++++++++++++++++++++++ include/trace/events/f2fs.h | 2 ++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bda784e38407..3c1221c12026 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1948,7 +1948,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7c7ebf323255..e32bc391ed0b 100644 --- 
a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, @@ -2251,6 +2252,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b914cfb49096..7d7a8270bbbe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode) stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index b95872b9c3ae..c9ea83dfd986 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,6 +15,7 @@ TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(INMEM); TRACE_DEFINE_ENUM(INMEM_DROP); 
+TRACE_DEFINE_ENUM(INMEM_INVALIDATE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); @@ -52,6 +53,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) From dd6b2029c25b44d823d93b7cde0eea6a469a27fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 17 Mar 2017 10:04:15 +0800 Subject: [PATCH 0196/1212] f2fs: don't allow atomic writes for not regular files The atomic writes only supports regular files for database. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 38d39f656746..0e15770cc728 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1523,6 +1523,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; From 3d60b5db39906313247391c2173496fb0927e5e9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 17 Mar 2017 15:43:57 +0800 Subject: [PATCH 0197/1212] f2fs: don't allow volatile writes for non-regular file Now f2fs only supports volatile writes for journal db regular file. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0e15770cc728..055495008c6c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1603,6 +1603,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; From 363f8e93f52dafc302716056f31e835b045cd2c3 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 4 Mar 2017 22:13:10 +0800 Subject: [PATCH 0198/1212] f2fs: make sure trace all f2fs_issue_flush The root device's issue flush trace is missing, add it and tracing the result from submit. Fixes d50aaeec90 ("f2fs: show actual device info in tracepoints") Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 24 +++++++++++++----------- include/trace/events/f2fs.h | 11 +++++++---- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7d7a8270bbbe..26eef87e82ec 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -441,7 +441,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; @@ -450,23 +451,24 @@ static int __submit_flush_wait(struct block_device *bdev) bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - 
trace_f2fs_issue_flush(FDEV(i).bdev, - test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c9ea83dfd986..8ca1ddf50dc1 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1166,26 +1166,29 @@ TRACE_EVENT(f2fs_issue_reset_zone, TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct block_device *dev, unsigned int nobarrier, - unsigned int flush_merge), + unsigned int flush_merge, int ret), - TP_ARGS(dev, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", + TP_printk("dev = (%d,%d), %s %s, ret = %d", show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? " with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, From bfd70a38c16385130ad653d46a2ec694bddbb762 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 28 Feb 2017 21:34:47 +0800 Subject: [PATCH 0199/1212] f2fs: drop duplicate radix tree lookup of nat_entry_set The nat entry is listed from the set list for freeing, it's duplicate to do radix tree lookup again. 
Signed-off-by: Kinglong Mee [Jaegeuk Kim: remove unnecessary f2fs_bug_on] Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5c70f33a2b4c..edabf883cf0c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -177,18 +177,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -2410,7 +2404,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); From 1ef38ece5cfe9b91e90a280a5513ecacbdc62920 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 28 Feb 2017 21:34:37 +0800 Subject: [PATCH 0200/1212] f2fs: remove dead macro PGOFS_OF_NEXT_DNODE Fixes: 3cf4574705 ("f2fs: introduce get_next_page_offset to speed up SEEK_DATA") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e32bc391ed0b..83ccbfceffdf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2097,12 +2097,6 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) ((is_inode_flag_set(i, 
FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) - /* * file.c */ From 9f5bdf3b0ab30cf0dacca1c0dcd08f5b258bbc37 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 13:54:56 -0800 Subject: [PATCH 0201/1212] f2fs: show more precise message on orphan recovery failure This case is not caused by fsck.f2fs. User needs to retry mount. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 08b9a1f578e3..4a0b286790e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -568,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } From 633f62b7dcf918cfcc01b90b41da78d30679df38 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 1 Mar 2017 18:07:10 +0800 Subject: [PATCH 0202/1212] f2fs: skip writeback meta pages if cp_mutex acquire failed Skip writeback meta pages if cp_mutex lock acquire failed, cp will flush dirty pages instead. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4a0b286790e0..61c519688f9d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -276,10 +276,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping, get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); From 28fa89b32d44f7a09e6ef7357049a4ebce892e3b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 18:02:02 -0800 Subject: [PATCH 0203/1212] f2fs: allocate a bio for discarding when actually issuing it Let's allocate a bio when issuing discard commands later. 
Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 4 +- fs/f2fs/segment.c | 192 ++++++++++++++++++++++++---------------------- 2 files changed, 105 insertions(+), 91 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 83ccbfceffdf..4ac8700d362a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -264,10 +264,12 @@ enum { struct discard_cmd { struct list_head list; /* command list */ struct completion wait; /* compleation */ + struct block_device *bdev; /* bdev */ block_t lstart; /* logical start address */ + block_t start; /* actual start address in dev */ block_t len; /* length */ - struct bio *bio; /* bio */ int state; /* state */ + int error; /* bio error */ }; struct discard_cmd_control { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 26eef87e82ec..f0c06e4785a9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -666,7 +666,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } static void __add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *cmd_list = &(dcc->discard_cmd_list); @@ -674,11 +675,12 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio = bio; - bio->bi_private = dc; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); mutex_lock(&dcc->cmd_lock); @@ -688,70 +690,27 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { - int err = dc->bio->bi_error; - if (dc->state == D_DONE) atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); - if (err == -EOPNOTSUPP) - err = 0; + if (dc->error == -EOPNOTSUPP) + dc->error = 
0; - if (err) + if (dc->error) f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); - bio_put(dc->bio); + "Issue discard failed, ret: %d", dc->error); list_del(&dc->list); kmem_cache_free(discard_cmd_slab, dc); } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->discard_cmd_list); - struct discard_cmd *dc, *tmp; - struct blk_plug plug; - - mutex_lock(&dcc->cmd_lock); - - blk_start_plug(&plug); - - list_for_each_entry_safe(dc, tmp, wait_list, list) { - - if (blkaddr == NULL_ADDR) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(REQ_SYNC, dc->bio); - atomic_inc(&dcc->submit_discard); - } - continue; - } - - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { - if (dc->state == D_SUBMIT) - wait_for_completion_io(&dc->wait); - else - __remove_discard_cmd(sbi, dc); - } - } - blk_finish_plug(&plug); - - /* this comes from f2fs_put_super */ - if (blkaddr == NULL_ADDR) { - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } - } - mutex_unlock(&dcc->cmd_lock); -} - static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; complete(&dc->wait); + dc->error = bio->bi_error; dc->state = D_DONE; + bio_put(bio); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -835,6 +794,88 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if 
(!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->submit_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(REQ_SYNC, bio); + } + } else { + __remove_discard_cmd(sbi, dc); + } +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + block_t lblkstart = blkstart; + + trace_f2fs_issue_discard(bdev, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + __add_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + return 0; +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->discard_cmd_list); + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + + mutex_lock(&dcc->cmd_lock); + + blk_start_plug(&plug); + + list_for_each_entry_safe(dc, tmp, wait_list, list) { + + if (blkaddr == NULL_ADDR) { + __submit_discard_cmd(sbi, dc); + continue; + } + + if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { + if (dc->state == D_SUBMIT) + wait_for_completion_io(&dc->wait); + else + __remove_discard_cmd(sbi, dc); + } + } + blk_finish_plug(&plug); + + /* this comes from f2fs_put_super */ + if (blkaddr == NULL_ADDR) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -852,15 +893,14 @@ static int issue_discard_thread(void *data) mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, cmd_list, list) { - if (dc->state == D_PREP) { - dc->state = 
D_SUBMIT; - submit_bio(REQ_SYNC, dc->bio); - atomic_inc(&dcc->submit_discard); - if (iter++ > DISCARD_ISSUE_RATE) - break; - } else if (dc->state == D_DONE) { + + if (is_idle(sbi)) + __submit_discard_cmd(sbi, dc); + + if (dc->state == D_PREP && iter++ > DISCARD_ISSUE_RATE) + break; + if (dc->state == D_DONE) __remove_discard_cmd(sbi, dc); - } } mutex_unlock(&dcc->cmd_lock); @@ -874,34 +914,6 @@ static int issue_discard_thread(void *data) goto repeat; } - -/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t blkstart, block_t blklen) -{ - struct bio *bio = NULL; - block_t lblkstart = blkstart; - int err; - - trace_f2fs_issue_discard(bdev, blkstart, blklen); - - if (sbi->s_ndevs) { - int devi = f2fs_target_device_index(sbi, blkstart); - - blkstart -= FDEV(devi).start_blk; - } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - bio->bi_end_io = f2fs_submit_discard_endio; - __add_discard_cmd(sbi, bio, lblkstart, blklen); - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); - } - return err; -} - #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -925,7 +937,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, lblkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: sector = SECTOR_FROM_BLOCK(blkstart); @@ -957,7 +969,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return 
__f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, From 02f88520d6f38a485923badc286238a8833dd3ca Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 2 Mar 2017 10:36:20 +0800 Subject: [PATCH 0204/1212] f2fs: add a punch discard command function This patch adds a function to punch a discard command when a segment is reused before it has been discarded. The reused segment is split out of the multi-segment discard range, and the larger remaining range is kept for discard. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f0c06e4785a9..5fc0173af7e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -838,6 +838,25 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + block_t end_block = START_BLOCK(sbi, GET_SEGNO(sbi, blkaddr) + 1); + + if (dc->state == D_DONE || dc->lstart + dc->len <= end_block) { + __remove_discard_cmd(sbi, dc); + return; + } + + if (blkaddr - dc->lstart < dc->lstart + dc->len - end_block) { + dc->start += (end_block - dc->lstart); + dc->len -= (end_block - dc->lstart); + dc->lstart = end_block; + } else { + dc->len = blkaddr - dc->lstart; + } +} + /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -860,8 +879,7 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { if (dc->state == D_SUBMIT) wait_for_completion_io(&dc->wait); - else - __remove_discard_cmd(sbi, dc); + __punch_discard_cmd(sbi, dc, blkaddr); } } blk_finish_plug(&plug); From 23128a06f3b485635c8388317755c8a7ec1383ef Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: 
Wed, 8 Mar 2017 10:47:11 +0800 Subject: [PATCH 0205/1212] f2fs: use parameter max_items instead of PIDVEC_SIZE Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..c82ab4048127 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; From ba5e838808d91d5203ddac1031fa2bb186c565dd Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 8 Mar 2017 10:47:12 +0800 Subject: [PATCH 0206/1212] f2fs: check range before defragment This patch checks the range parameter passed by the ioctl to avoid a range that exceeds the max_file_blocks limit. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 055495008c6c..db0659c3c740 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2059,6 +2059,12 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) goto out; } + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) { + err = -EINVAL; + goto out; + } + err = f2fs_defragment_range(sbi, filp, &range); f2fs_update_time(sbi, REQ_TIME); if (err < 0) From ce8679a179a818dad97917e8a3077ff7e4518f64 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Mar 2017 15:24:24 -0800 Subject: [PATCH 0207/1212] f2fs: add fault injection on f2fs_truncate Inject a fault during f2fs_truncate(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 6 ++++++ fs/f2fs/super.c | 1 + 3 files changed, 8 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4ac8700d362a..c524d875ac79 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -51,6 +51,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index db0659c3c740..094b83b53ebc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -622,6 +622,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b760414f3f9d..779fd5e5cf40 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; From cb7b3c2fe5e91e92eebff970e18dacdc0df1a194 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Wed, 8 Mar 2017 13:39:16 +0800 Subject: [PATCH 0208/1212] f2fs: adjust the way of calculating nat block use a slightly simpler expression to calculate nat block with nid. 
Signed-off-by: Fan Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 2f9603fa85a5..ebed0240aa53 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) From 0812585ae021e44eb27c89bc93582471d2475b22 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:54:03 +0800 Subject: [PATCH 0209/1212] f2fs: drop duplicate new_size assign in f2fs_zero_range Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 094b83b53ebc..9e94ba41a559 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1202,8 +1202,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { From 00a248a675d5a94b70d309f3378d0603bd06a180 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:54:26 +0800 Subject: [PATCH 0210/1212] f2fs: avoid copy date to user-space if move file range fail If move file range return error, the data copied to user-space is duplicate. 
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9e94ba41a559..7fe9ee4a605d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2213,6 +2213,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) From 74492a8e110cb26337d4a7816a7228383c81e392 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:54:52 +0800 Subject: [PATCH 0211/1212] f2fs: check new size by inode_newsize_ok in f2fs_insert_range Using inode_newsize_ok is better than only checking the maxbytes, since it also covers e.g. the rlimit. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fe9ee4a605d..702f89a94a9c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1269,8 +1269,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; From 0ec599668d9a5fa0feb60c458a441886547a7c46 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:55:07 +0800 Subject: [PATCH 0212/1212] f2fs: move mnt_want_write_file after arguments checking mnt_want_write_file is not needed while the arguments are being checked. 
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 702f89a94a9c..0493afe2b068 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2042,45 +2042,37 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) + return -EINVAL; + err = mnt_want_write_file(filp); if (err) return err; - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } - - if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } - - /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } - - if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) { - err = -EINVAL; - goto out; - } - err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, From 186a33ffeb490bfa2b89905789e2dfa8e320b025 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 20:43:20 +0800 Subject: [PATCH 0213/1212] f2fs: clear FI_DATA_EXIST flag in truncate_inline_inode Clear FI_DATA_EXIST flag atomically 
in truncate_inline_inode, and the return value from truncate_inline_inode isn't used, remove it. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 +------- fs/f2fs/file.c | 4 +--- fs/f2fs/inline.c | 21 +++++++++++---------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c524d875ac79..0f7a5a9a8416 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1940,12 +1940,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -2575,7 +2569,7 @@ extern struct kmem_cache *inode_entry_slab; bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void read_inline_data(struct page *page, struct page *ipage); -bool truncate_inline_inode(struct page *ipage, u64 from); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0493afe2b068..24bbe14ff5db 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,9 +571,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(ipage, from); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d82e97b1e6c4..2b8ac2cd35d6 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -63,19 +63,21 @@ void 
read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; if (from >= MAX_INLINE_DATA) - return false; + return; addr = inline_data_addr(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -146,11 +148,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -267,9 +269,8 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -380,7 +381,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -455,7 +456,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, 
inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); From 8f3d1ba54be036b594c017dae8a7087f5710b052 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 13 Mar 2017 16:35:13 +0800 Subject: [PATCH 0214/1212] f2fs: fix bad prefetchw of NULL page For f2fs_read_data_pages, f2fs_mpage_readpages gets "page == NULL", so prefetchw(&page->flags) operates on a NULL page. Fixes: f1e8866016 ("f2fs: expose f2fs_mpage_readpages") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3c1221c12026..9781e0b9153c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1150,9 +1150,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) From 89f28f5e525195254cce03b9e871f5b0f6b6ba80 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Mar 2017 20:10:41 +0800 Subject: [PATCH 0215/1212] f2fs: cover update_free_nid_bitmap with nid_list_lock free_nid_bitmap and free_nid_count in update_free_nid_bitmap should be updated atomically; use nid_list_lock to cover them to avoid races in concurrent scenarios.
Signed-off-by: Chao Yu Reviewed-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/node.c | 27 +++++++++++---------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0f7a5a9a8416..4398abfe13b1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -632,7 +632,6 @@ struct f2fs_nm_info { unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ - spinlock_t free_nid_lock; /* protect updating of nid count */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index edabf883cf0c..077bdb134e97 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1818,7 +1818,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build, bool locked) + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1832,14 +1832,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!locked) - spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - if (!locked) - spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1868,7 +1864,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2023,7 +2021,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) __insert_nid_to_list(sbi, i, 
ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false, false, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2079,7 +2077,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2409,11 +2407,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false, false); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2538,10 +2536,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK; last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; - spin_lock(&nm_i->free_nid_lock); + spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) - update_free_nid_bitmap(sbi, nid, true, true, true); - spin_unlock(&nm_i->free_nid_lock); + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } for (i = 0; i < nm_i->nat_blocks; i++) { @@ -2632,9 +2630,6 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; - - spin_lock_init(&nm_i->free_nid_lock); - return 0; } From 27eff7f2f11c6abb30bc338b70bb960fa8ace1a0 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 16:28:46 +0800 Subject: [PATCH 0216/1212] f2fs: cleanup the disk level filename updating As discuss with Jaegeuk and Chao, "Once checkpoint is done, f2fs doesn't need to update there-in filename at all." 
The disk-level filename is used in only one case: 1. create a file A under a dir 2. sync A 3. godown 4. umount 5. mount (roll_forward) Only rename/cross_rename changes the filename; if it happens a. between step 1 and 2, the sync of A will cause a checkpoint, so the roll_forward at step 5 never happens. b. after step 2, the roll_forward happens and file A will roll forward to the result as of step 1. So any update of the on-disk filename is useless; just clean it up. Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 25 ++++--------------------- fs/f2fs/f2fs.h | 2 -- fs/f2fs/file.c | 8 -------- fs/f2fs/inline.c | 2 -- fs/f2fs/namei.c | 29 ----------------------------- 5 files changed, 4 insertions(+), 62 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4e2153620a3b..b71b7f364107 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -337,24 +337,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -438,8 +420,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync.
@@ -599,8 +584,6 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4398abfe13b1..2410f1b4ece2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2154,8 +2154,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 24bbe14ff5db..bc5f73828a9b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -112,20 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; - if (file_enc_name(inode)) - return 0; - inode = igrab(inode); dentry = d_find_any_alias(inode); iput(inode); if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2b8ac2cd35d6..a92370516659 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -528,8 +528,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 65fff81889cf..43eb2bd417a8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -717,13 +717,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, 
new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -776,8 +769,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, down_write(&F2FS_I(old_inode)->i_sem); file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -914,18 +905,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -969,14 +948,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); From 96d73c33c4a3749d1d2b654313bcc98db0156cd1 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Wed, 15 Mar 2017 21:12:50 +0800 Subject: [PATCH 0217/1212] f2fs: sanity check of crc_offset from raw checkpoint The crc_offset towards or beyond the end of block is wrong, sanity check it. 
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 61c519688f9d..afe4616af025 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -679,7 +679,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; From cc248f964ee86a60e04b549caa70277db9c55374 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 18 Mar 2017 09:20:55 +0800 Subject: [PATCH 0218/1212] f2fs: avoid stat_inc_atomic_write for non-atomic file After filemap_write_and_wait_range fails, the FI_ATOMIC_FILE flag is cleared, so f2fs should not increase the atomic_write stat.
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bc5f73828a9b..a96d3193f209 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1538,17 +1538,21 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + goto out; + } + +inc_stat: stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; From fecfdd67f86e66a821253e5a6d9d41ed0348cd45 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 18 Mar 2017 09:25:05 +0800 Subject: [PATCH 0219/1212] f2fs: calculate the f2fs_stat_info into base_mem The memory size of f2fs_stat_info also should be calculated. 
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ee2d0a485fc3..ef1179df05d9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -156,7 +156,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; From ac60235af91e76a5b497cac4f1524f43cf8e4633 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 18 Mar 2017 09:26:13 +0800 Subject: [PATCH 0220/1212] f2fs: more reasonable mem_size calculating of ino_entry Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 077bdb134e97..c31283624cfe 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * From 506e7056e23ec95f2bd45fde50384bb036bb14af Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 21 Mar 2017 20:09:45 +0800 Subject: [PATCH 0221/1212] f2fs: fix recording invalid last_victim When doing garbage collection, we try to record segment offset which locates at next one of last victim, using it as the start offset in next 
searching. But in some corner cases, the recorded offset may cross the end of the main segment area, which causes an incorrect search in the dirty_segmap bitmap. This patch adds a modulo operation to avoid this issue. Reported-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 68d6a4cad349..c5644127fd4f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -361,6 +361,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, sbi->last_victim[p.gc_mode] = last_victim + 1; else sbi->last_victim[p.gc_mode] = segno + 1; + sbi->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } From ca28c969a4b2478f97a9e176163a5dbf2e3fd617 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 22 Mar 2017 11:59:30 +0800 Subject: [PATCH 0222/1212] f2fs: use set_page_private marcro in f2fs_trace_pid Use the set_page_private macro instead of operating on the page struct directly. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index c82ab4048127..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; From 743ef11f591dcbb9d0318a107e35cdb40419f9a4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Mar 2017 14:45:05 +0800 Subject: [PATCH 0223/1212] f2fs: fix race condition in between free nid allocator/initializer In the concurrent case below, an allocated nid can be loaded into the free nid cache and be allocated again.
Thread A Thread B - f2fs_create - f2fs_new_inode - alloc_nid - __insert_nid_to_list(ALLOC_NID_LIST) - f2fs_balance_fs_bg - build_free_nids - __build_free_nids - scan_nat_page - add_free_nid - __lookup_nat_cache - f2fs_add_link - init_inode_metadata - new_inode_page - new_node_page - set_node_addr - alloc_nid_done - __remove_nid_from_list(ALLOC_NID_LIST) - __insert_nid_to_list(FREE_NID_LIST) This patch makes nat cache lookup and free nid list operation being atomical to avoid this race condition. Signed-off-by: Jaegeuk Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 63 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c31283624cfe..c098e90cfae7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1764,40 +1764,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) return false; - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return false; - } - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return true; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - 
init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return true; - } - return true; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) From df1b8e6f245a1ebdd0055bf603009a8473237d16 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Mar 2017 17:23:45 +0800 Subject: [PATCH 0224/1212] f2fs: show the max number of volatile operations This patch adds to show the max number of volatile operations which are conducting concurrently. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 10 ++++++++-- fs/f2fs/f2fs.h | 18 +++++++++++++++++- fs/f2fs/file.c | 5 +++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ef1179df05d9..0baa3ee39392 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,7 +51,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); if (SM_I(sbi) && SM_I(sbi)->fcc_info) @@ -337,8 +339,10 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flush, si->nr_discard); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. 
%4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -438,7 +442,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2410f1b4ece2..4b19cba0fee2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -982,7 +982,9 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -2395,7 +2397,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int aw_cnt, max_aw_cnt; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2478,6 +2480,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = 
atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2534,6 +2547,9 @@ void f2fs_destroy_root_stats(void); #define stat_inc_atomic_write(inode) #define stat_dec_atomic_write(inode) #define stat_update_max_atomic_write(inode) +#define stat_inc_volatile_write(inode) +#define stat_dec_volatile_write(inode) +#define stat_update_max_volatile_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a96d3193f209..b06a52e33a79 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1431,6 +1431,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1618,6 +1619,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1673,6 +1677,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } From afc8c720de8613d2e740a96e030060643b21b0d3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Mar 2017 17:23:46 +0800 Subject: [PATCH 0225/1212] f2fs: don't track volatile file in dirty inode list Don't track volatile file in dirty inode list, otherwise with data_flush option, background thread will entry into endless 
loop for flushing journal file's pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index afe4616af025..9d92f83cce94 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -818,7 +818,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } From b92a30224597ac6062298756fba67380d7069113 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Mar 2017 13:38:25 +0800 Subject: [PATCH 0226/1212] f2fs: clean up xattr operation 1. don't allocate redundant memory in read_all_xattrs. 2. introduce RESERVED_XATTR_SIZE for cleanup. Signed-off-by: Chao Yu Reviewed-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 25 +++++++++++-------------- fs/f2fs/xattr.h | 3 ++- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index fb5062a4df77..afe14845c00a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -297,15 +297,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? 
VALID_XATTR_BLOCK_SIZE : 0; - unsigned int inline_size = 0; + unsigned int inline_size = inline_xattr_size(inode); int err = 0; - inline_size = inline_xattr_size(inode); - if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -375,13 +373,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -405,19 +404,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. 
*/ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -439,14 +438,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index ba64f43d163d..d111568daf83 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -73,7 +73,8 @@ struct f2fs_xattr_entry { !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) #define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) -#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define RESERVED_XATTR_SIZE (sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - RESERVED_XATTR_SIZE) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) From fbe4cc0f76dcd78a09acd155761b505291e84087 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Mar 2017 13:38:26 +0800 Subject: [PATCH 0227/1212] f2fs: don't reserve additional space in xattr block In this patch, we change xattr block disk layout as below: Before: xattr node block layout +---------------------------------------------+---------------+-------------+ | node block xattr entries | reserved | node footer | | 4068 Bytes | 4 Bytes | 24 Bytes | In memory layout +--------------------+---------------------------------+--------------------+ | inline xattr | node block xattr entries | reserved | | 200 
Bytes | 4068 Bytes | 4 Bytes | After: xattr node block layout +-------------------------------------------------------------+-------------+ | node block xattr entries | node footer | | 4072 Bytes | 24 Bytes | In memory layout +--------------------+---------------------------------+--------------------+ | inline xattr | node block xattr entries | reserved | | 200 Bytes | 4072 Bytes | 4 Bytes | With this change, we don't need to reserve additional space in node block, just keep reserved space in logical in-memory layout. So that it would help to enlarge valid free space of xattr node block. As tested, generic/026 shows max stored xattr entries number increases from 531 to 532 when inline_xattr option is enabled. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 6 +++--- fs/f2fs/xattr.h | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index afe14845c00a..aaf0a4167175 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -303,7 +303,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -379,7 +379,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, void *txattr_addr; int err; - txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -498,7 +498,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 
d111568daf83..91f3bd88dcc6 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -72,9 +72,8 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) -#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) -#define RESERVED_XATTR_SIZE (sizeof(__u32)) -#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - RESERVED_XATTR_SIZE) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) From aa9de43b3bc690f168f03964a53cb615227c694f Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 13 Mar 2017 20:22:18 +0800 Subject: [PATCH 0228/1212] f2fs: allow write page cache when writting cp This patch allow write data to normal file when writting new checkpoint. We relax three limitations for write_begin path: 1. data allocation 2. node allocation 3. variables in checkpoint Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 40 ++++++++++++++++++++++++++++------------ fs/f2fs/data.c | 28 ++++++++++++++++++++++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 12 ++++++------ fs/f2fs/super.c | 1 + 5 files changed, 58 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9d92f83cce94..a9f141abac5d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -945,6 +945,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = 
cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -971,7 +984,14 @@ static int block_operations(struct f2fs_sb_info *sbi) goto retry_flush_dents; } + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + down_write(&sbi->node_change); + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) @@ -979,10 +999,6 @@ static int block_operations(struct f2fs_sb_info *sbi) goto retry_flush_dents; } - /* - * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. - */ retry_flush_nodes: down_write(&sbi->node_write); @@ -990,11 +1006,19 @@ static int block_operations(struct f2fs_sb_info *sbi) up_write(&sbi->node_write); err = sync_node_pages(sbi, &wbc); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. 
+ */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -1062,7 +1086,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1079,14 +1102,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1105,10 +1125,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); spin_lock(&sbi->cp_lock); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9781e0b9153c..49c04c4e3bd8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -786,6 +786,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. 
@@ -828,7 +843,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -938,7 +953,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -947,7 +962,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -1686,7 +1701,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1722,7 +1737,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1736,7 +1752,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b19cba0fee2..1f3576a74112 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -897,6 +897,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in 
jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c098e90cfae7..b737c049174a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2451,10 +2451,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* @@ -2499,8 +2500,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); - - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); + /* Allow dirty nats by node block allocation in write_begin */ } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 779fd5e5cf40..19abc4fc6592 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1924,6 +1924,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); From 0137923fb501557e166e2d1dc02a4cb1c148b761 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 25 Mar 2017 00:03:02 -0700 Subject: [PATCH 0229/1212] f2fs: fix wrong max cost initialization This patch fixes the missing max cost increase caused by a patch that increased the cost of data segments in the greedy algorithm. 
Cc: # v4.10+ Fixes: b9cd20619 "f2fs: node segment is prior to data segment selected victim" Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c5644127fd4f..3db2d26e004a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -182,7 +182,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ From 5bac5ad719c9fce5975f1c838a66b530d4a3d8f2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Mar 2017 20:41:45 -0400 Subject: [PATCH 0230/1212] f2fs: allocate node and hot data in the beginning of partition In order to give more spatial locality, this patch changes the block allocation policy which assigns beginning of partition for small and hot data/node blocks. In order to do this, we set noheap allocation by default and introduce another mount option, heap, to reset it back. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 +++++- fs/f2fs/segment.c | 9 +++++++++ fs/f2fs/super.c | 10 +++++++++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3db2d26e004a..90ed2cdff86d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = sbi->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5fc0173af7e3..a305b38737f8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1598,6 +1598,14 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. 
@@ -1616,6 +1624,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19abc4fc6592..e43824849cb7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -83,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -117,6 +118,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -437,6 +439,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -915,7 +920,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1049,6 +1056,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { From f546e14f0351c04dc996a266f20d81ad639ba863 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Mar 2017 21:08:56 -0400 Subject: [PATCH 0231/1212] f2fs: start SSR much eariler to avoid FG_GC This patch initiates SSR much earlier, resulting in less FG_GC. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e8ad4280a50..31846b0fcb95 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -495,7 +495,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) return false; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + 2 * reserved_sections(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, From 361ee401443b1eda1267f0a2f9113e45f1fdd947 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 21 Mar 2017 10:59:50 -0400 Subject: [PATCH 0232/1212] f2fs: relax node version check for victim data in gc - has_not_enough_free_secs node_secs: 0 dent_secs: 0 freed:0 free_segments:103 reserved:104 - f2fs_gc - get_victim_by_default alloc_mode 0, gc_mode 1, max_search 2672, offset 4654, ofs_unit 1 - do_garbage_collect start_segno 3976, end_segno 3977 type 0 - is_alive nid 22797, blkaddr 2131882, ofs_in_node 0, version 0x8/0x0 - gc_data_segment 766, segno 3976, block 512/426 not alive So, this patch fixes subtle corrupted case where node version does not match to summary version which results in infinite loop by gc. 
Reported-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 90ed2cdff86d..d712b64ee6c2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -555,8 +555,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); From 0d5b6b22f109d29112080ef74da07c2ac8916e3c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Mar 2017 17:19:58 +0800 Subject: [PATCH 0233/1212] f2fs: show issued flush/discard count Show historical count of flush command and discard command. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 24 ++++++++++++++++-------- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/segment.c | 39 ++++++++++++++++++++++++--------------- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0baa3ee39392..f27e66ea7ff3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -56,12 +56,18 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); - if (SM_I(sbi) && SM_I(sbi)->fcc_info) - si->nr_flush = - atomic_read(&SM_I(sbi)->fcc_info->submit_flush); - if (SM_I(sbi) && SM_I(sbi)->dcc_info) - si->nr_discard = - atomic_read(&SM_I(sbi)->dcc_info->submit_discard); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + 
si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -336,9 +342,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d))\n", si->nr_wb_cp_data, si->nr_wb_data, - si->nr_flush, si->nr_discard); + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f3576a74112..26beb67825c2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -281,7 +281,8 @@ struct discard_cmd_control { wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; int max_discards; /* max. 
discards to be issued */ - atomic_t submit_discard; /* # of issued discard */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ }; /* for the list of fsync inodes, used only during recovery */ @@ -710,7 +711,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -2396,7 +2398,8 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a305b38737f8..717d6cc51ef2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -490,6 +490,8 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -507,25 +509,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return submit_flush_wait(sbi); - - if (!atomic_read(&fcc->submit_flush)) { - int ret; - - 
atomic_inc(&fcc->submit_flush); + if (!test_opt(sbi, FLUSH_MERGE)) { ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (!atomic_read(&fcc->issing_flush)) { + atomic_inc(&fcc->issing_flush); + ret = submit_flush_wait(sbi); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); + atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) @@ -533,10 +539,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issing_flush, 0); } return cmd.ret; @@ -556,7 +562,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; @@ -691,7 +698,7 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); + atomic_dec(&(SM_I(sbi)->dcc_info->issing_discard)); if (dc->error == -EOPNOTSUPP) dc->error = 0; @@ -810,7 +817,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (!dc->error) { /* should keep before submission to avoid D_DONE right away */ dc->state = D_SUBMIT; - atomic_inc(&dcc->submit_discard); + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; @@ -1214,7 +1222,8 @@ static int 
create_discard_cmd_control(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->discard_entry_list); INIT_LIST_HEAD(&dcc->discard_cmd_list); mutex_init(&dcc->cmd_lock); - atomic_set(&dcc->submit_discard, 0); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); dcc->nr_discards = 0; dcc->max_discards = 0; From 54c1e9049e250dfbbef3f2a51da4efa72e6e4b0c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Mar 2017 17:19:59 +0800 Subject: [PATCH 0234/1212] f2fs: count discard command entry Count discard command entries and show the number in debugfs; also add the cost of the discard command cache to the total consumed memory footprint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 +++++++++--- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 ++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f27e66ea7ff3..906f627e44fc 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -67,6 +67,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->issued_discard); si->nr_discarding = atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); @@ -220,8 +222,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); - if (SM_I(sbi)->dcc_info) + if (SM_I(sbi)->dcc_info) { si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -343,10 +348,11 @@ static int stat_show(struct seq_file *s, void *v) si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); 
seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " - "Discard: (%4d %4d))\n", + "Discard: (%4d %4d)) cmd: %4d\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->nr_discarding, si->nr_discarded); + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 26beb67825c2..101cc39a8f96 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -283,6 +283,7 @@ struct discard_cmd_control { int max_discards; /* max. discards to be issued */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ }; /* for the list of fsync inodes, used only during recovery */ @@ -2400,6 +2401,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 717d6cc51ef2..036b41257d60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -693,6 +693,8 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); list_add_tail(&dc->list, cmd_list); mutex_unlock(&dcc->cmd_lock); + + atomic_inc(&dcc->discard_cmd_cnt); } static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) @@ -708,6 +710,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *d "Issue discard failed, ret: %d", dc->error); list_del(&dc->list); kmem_cache_free(discard_cmd_slab, dc); + atomic_dec(&SM_I(sbi)->dcc_info->discard_cmd_cnt); } static void 
f2fs_submit_discard_endio(struct bio *bio) @@ -1224,6 +1227,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = 0; From 79bd5ed6e3181002b57cf693dff1f0a886007453 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Mar 2017 18:14:04 +0800 Subject: [PATCH 0235/1212] f2fs: clean up destroy_discard_cmd_control Remove unneeded parameter and simply change flow in destroy_discard_cmd_control. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 036b41257d60..4934e8869240 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1246,20 +1246,22 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return err; } -static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - if (dcc && dcc->f2fs_issue_discard) { + if (!dcc) + return; + + if (dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } - if (free) { - kfree(dcc); - SM_I(sbi)->dcc_info = NULL; - } + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -3152,7 +3154,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); - destroy_discard_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); From 77deaff0083f66f951b6415382e26b29b41d29af Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Mar 2017 18:18:50 
+0800 Subject: [PATCH 0236/1212] f2fs: use bitmap in discard_entry This patch changes to use bitmap instead of extent in struct discard_entry to indicate discard range in one segment, for fragmented space, this implementation can save memory footprint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++-- fs/f2fs/segment.c | 72 ++++++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 101cc39a8f96..c46d1b015db0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -249,11 +249,11 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; enum { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4934e8869240..0aab0bdb5da3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1041,32 +1041,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len && - last->len < MAX_DISCARD_BLOCKS(sbi)) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) 
+ start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->dcc_info->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -1079,6 +1053,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason == CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -1110,7 +1086,17 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } @@ -1196,13 +1182,35 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (force && len < cpc->trim_minlen) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = 
next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } } From 669457e6c2af8bdf84090510a4955a249697683f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Mar 2017 20:05:13 -0400 Subject: [PATCH 0237/1212] f2fs: write small sized IO to hot log It would better split small and large IOs separately in order to get more consecutive big writes. The default threshold is set to 64KB, but configurable by sysfs/min_hot_blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 9 +++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/inline.c | 1 + fs/f2fs/segment.c | 13 ++++++------- fs/f2fs/segment.h | 1 + fs/f2fs/super.c | 2 ++ 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 49c04c4e3bd8..5cb4067c3d84 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1430,6 +1430,8 @@ static int __write_data_page(struct page *page, bool *submitted, need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { @@ -1455,6 +1457,7 @@ static int __write_data_page(struct page *page, bool *submitted, if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, DATA, WRITE); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; } @@ -1509,6 +1512,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 
c46d1b015db0..2542548233db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -744,6 +744,7 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -1783,6 +1784,7 @@ enum { FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a92370516659..e4f12891d4e4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -137,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0aab0bdb5da3..ed9db665ffe7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1908,18 +1908,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (p_type == DATA) { struct inode *inode = page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { if (IS_DNODE(page)) return is_cold_node(page) ? 
CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } @@ -3026,6 +3024,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 31846b0fcb95..57e36c1ce7bd 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -540,6 +540,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e43824849cb7..3691413e51c3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -296,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -321,6 +322,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), From 654cbabc87dc4f5be00da12a7e7799e2eb5a28dc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Mar 2017 18:07:38 -0700 Subject: [PATCH 0238/1212] f2fs: avoid IO split due to mixed WB_SYNC_ALL and WB_SYNC_NONE If two threads try to flush dirty pages in different inodes respectively, f2fs_write_data_pages() will 
produce WRITE and WRITE_SYNC one at a time, resulting in a lot of 4KB separated IOs. So, this patch gives higher priority to WB_SYNC_ALL IOs and gathers write IOs with a big WRITE_SYNC'ed bio. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 15 +++++++++++++-- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5cb4067c3d84..481dd2cff3ac 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1603,8 +1603,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give a priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1660,9 +1662,18 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2542548233db..ad8c54848edf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -954,6 +954,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3691413e51c3..ea28312fa80f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1568,6 +1568,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); From bdc8c12ddd5508dd541c62daf3a9a6e5dadfd104 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 2 Apr 2017 02:39:48 +0800 Subject: [PATCH 0239/1212] f2fs: remove the redundant variable definition The variable 'i' has been defined before, so here we can use it directly. Signed-off-by: Kaixu Xia Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a9f141abac5d..8b106d10afe7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1164,7 +1164,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write nat bits */ if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); - unsigned int i; block_t blk; cp_ver |= ((__u64)crc32 << 32); From 074a551c90b9b1bf413b7b44ec5ee3ab9c7526eb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 30 Mar 2017 21:02:46 -0700 Subject: [PATCH 0240/1212] f2fs: submit bio of in-place-update pages This patch tries to split in-place-update bios from sequential bios. 
Suggested-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 ++++- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 481dd2cff3ac..9ce6c3435c00 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -361,6 +361,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } @@ -1352,7 +1355,7 @@ int do_write_data_page(struct f2fs_io_info *fio) !is_cold_data(page) && !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { - rewrite_data_page(fio); + err = rewrite_data_page(fio); set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ad8c54848edf..91174ed207de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2273,7 +2273,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -void rewrite_data_page(struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ed9db665ffe7..5e9635df9923 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2040,11 +2040,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { fio->new_blkaddr = fio->old_blkaddr; 
stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + return f2fs_submit_page_bio(fio); } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, From 34cc766bfbd1f720362f457a9dfe7603c76605d6 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 4 Apr 2017 13:01:22 +0300 Subject: [PATCH 0241/1212] f2fs: split make_dentry_ptr() into block and inline versions Since callers statically know which type to use, make_dentry_ptr() can simply be split into two inline functions. This way, there is less inlined code, fewer arguments, and no cast. Signed-off-by: Tomohiro Kusumi Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 32 +++++++++++++++----------------- fs/f2fs/inline.c | 10 +++++----- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b71b7f364107..db077960e376 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -366,7 +366,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -586,7 +586,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, } } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -894,7 +894,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void 
*)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 91174ed207de..48d3882f9d88 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -395,26 +395,24 @@ struct f2fs_dentry_ptr { int max; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) +{ + d->inode = inode; + d->max = NR_INLINE_DENTRY; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; } /* diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e4f12891d4e4..b3bd1012a4fc 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -302,7 +302,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -321,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, dentry_blk = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + 
make_dentry_ptr_inline(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -402,7 +402,7 @@ static int f2fs_add_inline_entries(struct inode *dir, unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -534,7 +534,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) From e5c2a70c4a8b85817054c1292f497309a7d871f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 4 Apr 2017 16:45:30 -0700 Subject: [PATCH 0242/1212] Revert "f2fs: put allocate_segment after refresh_sit_entry" This reverts commit 3436c4bdb30de421d46f58c9174669fbcfd40ce0. This makes a leak to register dirty segments. I reproduced the issue by modified postmark which injects a lot of file create/delete/update and finally triggers huge number of SSR allocations. 
Cc: # v4.10+ [Jaegeuk Kim: Change missing incorrect comment] Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5e9635df9923..c79f1b05d667 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1960,14 +1960,13 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); - /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. - */ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); + /* + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. + */ + refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); mutex_unlock(&sit_i->sentry_lock); From 4adc71ee11d7edb42a176efc970baa45ba9d6e9a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2017 18:19:48 +0800 Subject: [PATCH 0243/1212] f2fs: split discard_cmd_list Split discard_cmd_list to discard_{pend,wait}_list, so while sending/waiting discard command, we can avoid traversing unneeded entries in original list. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 47 ++++++++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 48d3882f9d88..d0d9668bd738 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -277,7 +277,8 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head discard_entry_list; /* 4KB discard entry list */ int nr_discards; /* # of discards in the list */ - struct list_head discard_cmd_list; /* discard cmd list */ + struct list_head discard_pend_list; /* store pending entries */ + struct list_head discard_wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; int max_discards; /* max. discards to be issued */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c79f1b05d667..0582eecb5272 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -677,7 +677,7 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *cmd_list = &(dcc->discard_cmd_list); + struct list_head *pend_list = &(dcc->discard_pend_list); struct discard_cmd *dc; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); @@ -691,7 +691,7 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, init_completion(&dc->wait); mutex_lock(&dcc->cmd_lock); - list_add_tail(&dc->list, cmd_list); + list_add_tail(&dc->list, pend_list); mutex_unlock(&dcc->cmd_lock); atomic_inc(&dcc->discard_cmd_cnt); @@ -826,6 +826,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); + list_move_tail(&dc->list, &dcc->discard_wait_list); } } else { __remove_discard_cmd(sbi, dc); @@ -872,31 +873,37 @@ static void 
__punch_discard_cmd(struct f2fs_sb_info *sbi, void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->discard_cmd_list); + struct list_head *pend_list = &(dcc->discard_pend_list); + struct list_head *wait_list = &(dcc->discard_wait_list); struct discard_cmd *dc, *tmp; struct blk_plug plug; mutex_lock(&dcc->cmd_lock); - blk_start_plug(&plug); + if (blkaddr == NULL_ADDR) + goto release_discard; + + list_for_each_entry_safe(dc, tmp, pend_list, list) { + if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) + __punch_discard_cmd(sbi, dc, blkaddr); + } list_for_each_entry_safe(dc, tmp, wait_list, list) { - - if (blkaddr == NULL_ADDR) { - __submit_discard_cmd(sbi, dc); - continue; - } - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { if (dc->state == D_SUBMIT) wait_for_completion_io(&dc->wait); __punch_discard_cmd(sbi, dc, blkaddr); } } - blk_finish_plug(&plug); +release_discard: /* this comes from f2fs_put_super */ if (blkaddr == NULL_ADDR) { + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, pend_list, list) + __submit_discard_cmd(sbi, dc); + blk_finish_plug(&plug); + list_for_each_entry_safe(dc, tmp, wait_list, list) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); @@ -910,7 +917,8 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *cmd_list = &dcc->discard_cmd_list; + struct list_head *pend_list = &dcc->discard_pend_list; + struct list_head *wait_list = &dcc->discard_wait_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int iter = 0; @@ -921,13 +929,17 @@ static int issue_discard_thread(void *data) blk_start_plug(&plug); mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, cmd_list, list) { + list_for_each_entry_safe(dc, tmp, pend_list, 
list) { + f2fs_bug_on(sbi, dc->state != D_PREP); if (is_idle(sbi)) __submit_discard_cmd(sbi, dc); - if (dc->state == D_PREP && iter++ > DISCARD_ISSUE_RATE) + if (iter++ > DISCARD_ISSUE_RATE) break; + } + + list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->state == D_DONE) __remove_discard_cmd(sbi, dc); } @@ -938,8 +950,8 @@ static int issue_discard_thread(void *data) iter = 0; congestion_wait(BLK_RW_SYNC, HZ/50); - wait_event_interruptible(*q, - kthread_should_stop() || !list_empty(&dcc->discard_cmd_list)); + wait_event_interruptible(*q, kthread_should_stop() || + !list_empty(pend_list) || !list_empty(wait_list)); goto repeat; } @@ -1231,7 +1243,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&dcc->discard_entry_list); - INIT_LIST_HEAD(&dcc->discard_cmd_list); + INIT_LIST_HEAD(&dcc->discard_pend_list); + INIT_LIST_HEAD(&dcc->discard_wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); From 745d922434837feebaa8497563bf8f8b39cde782 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2017 18:19:49 +0800 Subject: [PATCH 0244/1212] f2fs: introduce f2fs_wait_discard_bios Split f2fs_wait_discard_bios from f2fs_wait_discard_bio, just for cleanup, no logic change. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 37 ++++++++++++++++++++++--------------- fs/f2fs/super.c | 2 +- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0d9668bd738..63e333edefaa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2260,7 +2260,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0582eecb5272..acf0c2c7d3b3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -876,13 +876,9 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) struct list_head *pend_list = &(dcc->discard_pend_list); struct list_head *wait_list = &(dcc->discard_wait_list); struct discard_cmd *dc, *tmp; - struct blk_plug plug; mutex_lock(&dcc->cmd_lock); - if (blkaddr == NULL_ADDR) - goto release_discard; - list_for_each_entry_safe(dc, tmp, pend_list, list) { if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) __punch_discard_cmd(sbi, dc, blkaddr); @@ -896,19 +892,30 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) } } -release_discard: - /* this comes from f2fs_put_super */ - if (blkaddr == NULL_ADDR) { - blk_start_plug(&plug); - list_for_each_entry_safe(dc, tmp, pend_list, list) - __submit_discard_cmd(sbi, dc); - blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} - list_for_each_entry_safe(dc, tmp, 
wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } +/* This comes from f2fs_put_super */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list = &(dcc->discard_pend_list); + struct list_head *wait_list = &(dcc->discard_wait_list); + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + + mutex_lock(&dcc->cmd_lock); + + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, pend_list, list) + __submit_discard_cmd(sbi, dc); + blk_finish_plug(&plug); + + list_for_each_entry_safe(dc, tmp, wait_list, list) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); } + mutex_unlock(&dcc->cmd_lock); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea28312fa80f..d50eb8d27e60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -795,7 +795,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); From e09409d5c38de68d0c0f4fe43c3ae6da521fb58a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2017 18:26:26 +0800 Subject: [PATCH 0245/1212] f2fs: prevent waiter encountering incorrect discard states In f2fs_submit_discard_endio, we will wake up waiter before setting discard command states, so waiter may use incorrect states. Change the order between complete() and states setting to fix this issue. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf0c2c7d3b3..dc2737cfaa67 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -717,9 +717,9 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - complete(&dc->wait); dc->error = bio->bi_error; dc->state = D_DONE; + complete(&dc->wait); bio_put(bio); } @@ -886,8 +886,7 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { - if (dc->state == D_SUBMIT) - wait_for_completion_io(&dc->wait); + wait_for_completion_io(&dc->wait); __punch_discard_cmd(sbi, dc, blkaddr); } } @@ -947,8 +946,10 @@ static int issue_discard_thread(void *data) } list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->state == D_DONE) + if (dc->state == D_DONE) { + wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); + } } mutex_unlock(&dcc->cmd_lock); From f886a1df9e79b2e70d4fb4e6a93a328a8cf64405 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 5 Apr 2017 22:49:44 +0300 Subject: [PATCH 0246/1212] f2fs: fix comment on f2fs_flush_merged_bios() after 86531d6b Callers are to unlock the page on failure after 86531d6b. Signed-off-by: Tomohiro Kusumi Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ce6c3435c00..85da089004ee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -340,7 +340,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. 
*/ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { From 1c72805ab23732e18b36c8727ebad842786b18dc Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Sun, 9 Apr 2017 02:11:36 +0300 Subject: [PATCH 0247/1212] f2fs: guard macro variables with braces Add braces around variables used within macros for those make sense to do it. Many of the macros in f2fs already do this. What this commit doesn't do is anything that changes line# as a result of adding braces, which usually affects the binary via __LINE__. Confirmed no diff in fs/f2fs/f2fs.ko before/after this commit on x86_64, to make sure this has no functional change as well as there's been no unexpected side effect due to callers' arithmetics within the existing code. Signed-off-by: Tomohiro Kusumi Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 36 +++++++++++----------- fs/f2fs/node.c | 2 +- fs/f2fs/node.h | 22 +++++++------- fs/f2fs/segment.h | 76 +++++++++++++++++++++++------------------------ fs/f2fs/xattr.h | 4 +-- 5 files changed, 70 insertions(+), 70 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 63e333edefaa..cbc35660c466 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -64,7 +64,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -90,9 +90,9 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) 
(typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -295,13 +295,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -812,7 +812,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ @@ -1050,8 +1050,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. 
*/ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -2504,8 +2504,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2515,14 +2515,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2530,7 +2530,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? 
(blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b737c049174a..19ea77dc3192 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -22,7 +22,7 @@ #include "trace.h" #include -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index ebed0240aa53..558048e33cf9 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define 
nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 57e36c1ce7bd..b8a1bac9355d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,78 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - 
sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + 
(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) + ((segno) / (sbi)->segs_per_sec) #define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) + (((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +103,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. 
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 91f3bd88dcc6..08a4840d6d7d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) From 20e7964704de1526f45318fadc3b65aa80dcb662 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Apr 2017 14:27:07 -0700 Subject: [PATCH 0248/1212] f2fs: use segment number for get_valid_blocks This patch fixes to submit a segment number for get_valid_blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b8a1bac9355d..39ef9cc0093b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -80,6 +80,8 @@ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define GET_SECNO(sbi, segno) \ ((segno) / (sbi)->segs_per_sec) +#define GET_SEGNO_FROM_SECNO(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) #define GET_ZONENO_FROM_SEGNO(sbi, segno) \ (((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone) @@ -720,8 +722,8 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= - sbi->fggc_threshold) + if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), + sbi->segs_per_sec) >= sbi->fggc_threshold) return true; return false; } From af381ca699eb25cb4ab153300b5c0b9d3c727bb2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: 
Fri, 7 Apr 2017 14:33:22 -0700 Subject: [PATCH 0249/1212] f2fs: clean up get_valid_blocks with consistent parameter This patch cleans up get_valid_blocks, which has no functional change. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/gc.c | 8 ++++---- fs/f2fs/segment.c | 8 ++++---- fs/f2fs/segment.h | 8 ++++---- fs/f2fs/super.c | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 906f627e44fc..dc16a52db275 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -137,7 +137,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d712b64ee6c2..ec07940ea722 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -229,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -252,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int valid_blocks = - get_valid_blocks(sbi, segno, sbi->segs_per_sec); + get_valid_blocks(sbi, segno, true); return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
valid_blocks * 2 : valid_blocks; @@ -897,7 +897,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -931,7 +931,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + get_valid_blocks(sbi, start_segno, true) == 0) sec_freed = 1; stat_inc_call_count(sbi->stat_info); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dc2737cfaa67..8ab152f209af 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -636,7 +636,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + if (get_valid_blocks(sbi, segno, true) == 0) clear_bit(GET_SECNO(sbi, segno), dirty_i->victim_secmap); } @@ -657,7 +657,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -1188,7 +1188,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) secno = GET_SECNO(sbi, start); start_segno = secno * sbi->segs_per_sec; if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); @@ -2938,7 +2938,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + 
valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 39ef9cc0093b..053166038bfe 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -309,13 +309,13 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -722,8 +722,8 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), - sbi->segs_per_sec) >= sbi->fggc_threshold) + if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), true) >= + sbi->fggc_threshold) return true; return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d50eb8d27e60..aa6ee31ef39b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -997,7 +997,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) if ((i % 10) == 0) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else @@ -1023,7 +1023,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) seq_printf(seq, " %.2x", 
se->cur_valid_map[j]); seq_putc(seq, '\n'); From d1c1a744c455f1f76098792f9b1bdd682fc95dfe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Apr 2017 15:08:17 -0700 Subject: [PATCH 0250/1212] f2fs: clean up some macros in terms of GET_SEGNO This patch cleans several macros by introducing: - BLKS_PER_SEC - GET_SEC_FROM_SEG - GET_SEG_FROM_SEC - GET_ZONE_FROM_SEC - GET_ZONE_FROM_SEG Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 6 +++--- fs/f2fs/f2fs.h | 7 +++---- fs/f2fs/file.c | 3 +-- fs/f2fs/gc.c | 17 ++++++++--------- fs/f2fs/segment.c | 20 ++++++++++---------- fs/f2fs/segment.h | 34 ++++++++++++++++++++-------------- 6 files changed, 45 insertions(+), 42 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index dc16a52db275..692beff66bf8 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -109,8 +109,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -134,7 +134,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, true); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cbc35660c466..e93bf44c34fa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -202,12 +202,11 @@ enum { #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ 
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) -#define MAX_DISCARD_BLOCKS(sbi) \ - ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) -#define DISCARD_ISSUE_RATE 8 +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b06a52e33a79..321bfca8a4f9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1897,7 +1897,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; @@ -1961,7 +1960,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ec07940ea722..fe7716269b33 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -211,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) continue; clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -219,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -343,7 +343,7 @@ static int 
get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; @@ -372,7 +372,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -1006,7 +1006,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count, blocks_per_sec; + u64 main_count, resv_count, ovp_count; DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1014,8 +1014,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, - (main_count - resv_count)); + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8ab152f209af..40474e7c2033 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -637,7 +637,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, dirty_i->nr_dirty[t]--; if (get_valid_blocks(sbi, segno, true) == 0) - clear_bit(GET_SECNO(sbi, segno), + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -1185,8 +1185,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if 
(!IS_CURSEC(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), @@ -1541,8 +1541,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1552,8 +1552,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1584,8 +1584,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1629,7 +1629,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 053166038bfe..5f6ef163aa8f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -78,12 +78,16 @@ ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? 
\ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ ((segno) / (sbi)->segs_per_sec) -#define GET_SEGNO_FROM_SECNO(sbi, secno) \ +#define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - (((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ ((sbi)->sm_info->ssa_blkaddr + (segno)) @@ -305,7 +309,7 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, @@ -360,8 +364,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -381,7 +385,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -392,8 +397,8 @@ static inline void __set_test_and_free(struct 
f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -414,7 +419,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -479,12 +485,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int overprovision_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); } static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool need_SSR(struct f2fs_sb_info *sbi) @@ -722,7 +728,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), true) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= sbi->fggc_threshold) return true; return false; From 68033a5ab66f3973813d9b18fa9a28b1da48d13d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Apr 2017 17:25:54 -0700 Subject: [PATCH 0251/1212] f2fs: avoid frequent checkpoint during f2fs_gc Now we're doing SSR aggressively more than ever before, so once we reach to the reserved_segment, f2fs_balance_fs will call 
f2fs_gc, which triggers checkpoint every time. We actually must avoid that. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fe7716269b33..84ade968d149 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -966,9 +966,11 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + if (prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } if (has_not_enough_free_secs(sbi, 0, 0)) gc_type = FG_GC; } From 6e4fee6a144e7f275217cf4900d1cdd12f39d42d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Apr 2017 09:25:22 +0800 Subject: [PATCH 0252/1212] f2fs: extract rb-tree operation infrastructure rb-tree lookup/update functions are deeply coupled into the extent cache code, so it's very hard to reuse these basic functions. This patch extracts the common rb-tree operation infrastructure for later reuse.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 291 +++++++++++++++++++++++------------------ fs/f2fs/f2fs.h | 20 ++- 2 files changed, 179 insertions(+), 132 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6934f014e0f..68e649a31c7d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,146 @@ #include "node.h" #include +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +static struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion 
after. + * tree must stay unchanged between lookup and insertion. + */ +static struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -102,36 +242,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct 
f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -237,17 +347,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +373,6 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. 
- */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +427,7 @@ static struct extent_node *__insert_extent_tree(struct inode 
*inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -447,7 +477,10 @@ static void f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, &insert_p, &insert_parent); if (!en) en = next_en; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e93bf44c34fa..1a5737831401 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -444,16 +444,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; From fa3a914e8bf8f0f06e9423fc857ca4f5d44d00d3 Mon Sep 17 00:00:00 
2001 From: Chao Yu Date: Mon, 27 Mar 2017 18:14:05 +0800 Subject: [PATCH 0253/1212] f2fs: shrink blk plug region Don't let the blk plug cover a region where no IOs will be issued. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 40474e7c2033..1e726e893eec 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -932,9 +932,8 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; - blk_start_plug(&plug); - mutex_lock(&dcc->cmd_lock); + blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -944,6 +943,7 @@ static int issue_discard_thread(void *data) if (iter++ > DISCARD_ISSUE_RATE) break; } + blk_finish_plug(&plug); list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->state == D_DONE) { @@ -953,8 +953,6 @@ static int issue_discard_thread(void *data) } mutex_unlock(&dcc->cmd_lock); - blk_finish_plug(&plug); - iter = 0; congestion_wait(BLK_RW_SYNC, HZ/50); From be687c56d1f697a86abd1e3359e2b63fd3aa73bc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Apr 2017 19:01:26 -0700 Subject: [PATCH 0254/1212] f2fs: fix fs corruption due to zero inode page This patch fixes the following scenario. - f2fs_create/f2fs_mkdir - write_checkpoint - f2fs_mark_inode_dirty_sync - block_operations - f2fs_lock_all - f2fs_sync_inode_meta - f2fs_unlock_all - sync_inode_metadata - f2fs_lock_op - f2fs_write_inode - update_inode_page - get_node_page return -ENOENT - new_inode_page - fill_node_footer - f2fs_mark_inode_dirty_sync - ... - f2fs_unlock_op - f2fs_inode_synced - f2fs_lock_all - do_checkpoint In this checkpoint, we can get an inode page that is all zeros except for a valid node footer.
Cc: Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2520fa72b23f..0900814485c7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -316,7 +316,6 @@ int update_inode_page(struct inode *inode) } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -450,6 +449,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 43eb2bd417a8..35fca4c39993 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -420,8 +420,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -484,6 +482,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -505,8 +505,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - 
f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -521,6 +519,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -551,8 +551,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -566,6 +564,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -592,8 +592,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -619,6 +617,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: From 24f3c7e19565a9438dc075c9cd61a6d8a962c315 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Apr 2017 19:15:33 -0700 Subject: [PATCH 0255/1212] f2fs: give time to flush dirty pages for checkpoint If all the threads are waiting for checkpoint, we have no chance to flush required dirty pages. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8b106d10afe7..0983b7646444 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -981,6 +981,7 @@ static int block_operations(struct f2fs_sb_info *sbi) err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; + cond_resched(); goto retry_flush_dents; } @@ -996,6 +997,7 @@ static int block_operations(struct f2fs_sb_info *sbi) err = f2fs_sync_inode_meta(sbi); if (err) goto out; + cond_resched(); goto retry_flush_dents; } @@ -1010,6 +1012,7 @@ static int block_operations(struct f2fs_sb_info *sbi) f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } From 933686cf727b9428a0933c8d94b4ef4c2cedc0e3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Apr 2017 10:01:33 -0700 Subject: [PATCH 0256/1212] f2fs: allocate hot_data for atomic writes We'd better allocate atomic writes to hot_data zone. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 321bfca8a4f9..1da2ceaaac3e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1536,6 +1536,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) From 8aa17546af24daf62a9cad74d09ab6797065db48 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Apr 2017 12:02:00 -0700 Subject: [PATCH 0257/1212] f2fs: fix not to set fsync/dentry mark Otherwise, we can see stale fsync/dentry mark given by previous calls, resulting in giving up roll-forward recovery due to wrong dentry mark. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 19ea77dc3192..dbf0efeb1cde 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1458,6 +1458,9 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { From 9febed8ff933c555a0c637b49e09a4264f6c2907 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Apr 2017 15:46:23 -0700 Subject: [PATCH 0258/1212] f2fs: avoid dirty node pages in check_only recovery In the check_only mode, we should not make any dirty node pages. Otherwise, we can get this panic: F2FS-fs (nvme0n1p1): Need to recover fsync data ------------[ cut here ]------------ kernel BUG at fs/f2fs/node.c:2204! CPU: 7 PID: 19923 Comm: mount Tainted: G OE 4.9.8 #2 RIP: 0010:[] [] flush_nat_entries+0x43b/0x7d0 [f2fs] Call Trace: [] ? __f2fs_submit_merged_bio+0x5a/0xd0 [f2fs] [] ? __f2fs_submit_merged_bio+0x5a/0xd0 [f2fs] [] ? __f2fs_submit_merged_bio+0x8b/0xd0 [f2fs] [] ? up_write+0x1f/0x40 [] ? __f2fs_submit_merged_bio+0x8b/0xd0 [f2fs] [] write_checkpoint+0x2f4/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x350 [f2fs] [] ? __lock_is_held+0x52/0x70 [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xc32/0x10c0 [f2fs] [] mount_bdev+0x182/0x1b0 [] ? 
f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d025aa83fb5b..907d6b7dde6a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; @@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) goto out; From 062eb908b28711e4c9323a79dfe69a265efd64d5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 14 Apr 2017 23:24:55 +0800 Subject: [PATCH 0259/1212] f2fs: use rb-tree to track pending discard commands Introduce rb-tree based discard cache infrastructure to speed up lookup and merge operation of discard entry. 
Signed-off-by: Chao Yu [Jaegeuk Kim: initialize dc to avoid build warning] Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 15 +-- fs/f2fs/f2fs.h | 48 ++++++++- fs/f2fs/segment.c | 227 +++++++++++++++++++++++++++++++++-------- 3 files changed, 238 insertions(+), 52 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 68e649a31c7d..221ad086ee00 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -49,7 +49,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, return NULL; } -static struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *__lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -61,7 +61,7 @@ static struct rb_entry *__lookup_rb_tree(struct rb_root *root, return re; } -static struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root *root, struct rb_node **parent, unsigned int ofs) { @@ -92,13 +92,14 @@ static struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simpfy the insertion after. * tree must stay unchanged between lookup and insertion. 
*/ -static struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, - struct rb_node **insert_parent) + struct rb_node **insert_parent, + bool force) { struct rb_node **pnode = &root->rb_node; struct rb_node *parent = NULL, *tmp_node; @@ -145,12 +146,12 @@ static struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, return NULL; lookup_neighbors: - if (ofs == re->ofs) { + if (ofs == re->ofs || force) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&re->rb_node); *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); } - if (ofs == re->ofs + re->len - 1) { + if (ofs == re->ofs + re->len - 1 || force) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&re->rb_node); *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); @@ -481,7 +482,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, (struct rb_entry *)et->cached_en, fofs, (struct rb_entry **)&prev_en, (struct rb_entry **)&next_en, - &insert_p, &insert_parent); + &insert_p, &insert_parent, false); if (!en) en = next_en; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1a5737831401..dd25e6ff0785 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -261,13 +261,26 @@ enum { D_DONE, }; +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ - 
block_t lstart; /* logical start address */ - block_t start; /* actual start address in dev */ - block_t len; /* length */ int state; /* state */ int error; /* bio error */ }; @@ -284,6 +297,7 @@ struct discard_cmd_control { atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -584,6 +598,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) +{ + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); +} + static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { @@ -2640,6 +2672,16 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git 
a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1e726e893eec..9adc3bcfb4f4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -672,7 +672,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static void __add_discard_cmd(struct f2fs_sb_info *sbi, +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { @@ -689,18 +689,46 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, dc->state = D_PREP; dc->error = 0; init_completion(&dc->wait); - - mutex_lock(&dcc->cmd_lock); list_add_tail(&dc->list, pend_list); - mutex_unlock(&dcc->cmd_lock); - atomic_inc(&dcc->discard_cmd_cnt); + + return dc; } -static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); + + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_dec(&(SM_I(sbi)->dcc_info->issing_discard)); + atomic_dec(&dcc->issing_discard); + + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (dc->error == -EOPNOTSUPP) dc->error = 0; @@ -708,9 +736,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *d if (dc->error) f2fs_msg(sbi->sb, KERN_INFO, 
"Issue discard failed, ret: %d", dc->error); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); - atomic_dec(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) @@ -833,6 +859,148 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, } } +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, 
*insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + if (tdc) + __remove_discard_cmd(sbi, tdc); + + merged = true; + } + + if (!merged) + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + mutex_unlock(&dcc->cmd_lock); +} + static int __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { @@ -845,50 +1013,24 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, blkstart -= FDEV(devi).start_blk; } - __add_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); + 
__update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); return 0; } -static void __punch_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc, block_t blkaddr) -{ - block_t end_block = START_BLOCK(sbi, GET_SEGNO(sbi, blkaddr) + 1); - - if (dc->state == D_DONE || dc->lstart + dc->len <= end_block) { - __remove_discard_cmd(sbi, dc); - return; - } - - if (blkaddr - dc->lstart < dc->lstart + dc->len - end_block) { - dc->start += (end_block - dc->lstart); - dc->len -= (end_block - dc->lstart); - dc->lstart = end_block; - } else { - dc->len = blkaddr - dc->lstart; - } -} - /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->discard_pend_list); - struct list_head *wait_list = &(dcc->discard_wait_list); - struct discard_cmd *dc, *tmp; + struct discard_cmd *dc; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, pend_list, list) { - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) - __punch_discard_cmd(sbi, dc, blkaddr); - } - - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state != D_PREP) wait_for_completion_io(&dc->wait); - __punch_discard_cmd(sbi, dc, blkaddr); - } + __punch_discard_cmd(sbi, dc, blkaddr); } mutex_unlock(&dcc->cmd_lock); @@ -1257,6 +1399,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = 0; + dcc->root = RB_ROOT; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; From 2814d83ec77245d9855b2b053fcab47ae23614d9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 15 Apr 2017 14:09:36 +0800 Subject: 
[PATCH 0260/1212] f2fs: clean up discard_cmd_control structure Avoid long variable name in discard_cmd_control structure, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 8 ++++---- fs/f2fs/segment.c | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dd25e6ff0785..85821e6d71fd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -287,12 +287,12 @@ struct discard_cmd { struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - int nr_discards; /* # of discards in the list */ - struct list_head discard_pend_list; /* store pending entries */ - struct list_head discard_wait_list; /* store on-flushing entries */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list; /* store pending entries */ + struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; + int nr_discards; /* # of discards in the list */ int max_discards; /* max. 
discards to be issued */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9adc3bcfb4f4..d237efa523ca 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -677,7 +677,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->discard_pend_list); + struct list_head *pend_list = &(dcc->pend_list); struct discard_cmd *dc; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); @@ -852,7 +852,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); - list_move_tail(&dc->list, &dcc->discard_wait_list); + list_move_tail(&dc->list, &dcc->wait_list); } } else { __remove_discard_cmd(sbi, dc); @@ -1040,8 +1040,8 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->discard_pend_list); - struct list_head *wait_list = &(dcc->discard_wait_list); + struct list_head *pend_list = &(dcc->pend_list); + struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; struct blk_plug plug; @@ -1065,8 +1065,8 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *pend_list = &dcc->discard_pend_list; - struct list_head *wait_list = &dcc->discard_wait_list; + struct list_head *pend_list = &dcc->pend_list; + struct list_head *wait_list = &dcc->wait_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int iter = 0; @@ -1214,7 +1214,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct 
cp_control *cpc, unsigned int start = 0, end = -1; bool force = (cpc->reason == CP_DISCARD); struct discard_entry *de = NULL; - struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -1263,7 +1263,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -1289,7 +1289,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; @@ -1390,9 +1390,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; - INIT_LIST_HEAD(&dcc->discard_entry_list); - INIT_LIST_HEAD(&dcc->discard_pend_list); - INIT_LIST_HEAD(&dcc->discard_wait_list); + INIT_LIST_HEAD(&dcc->entry_list); + INIT_LIST_HEAD(&dcc->pend_list); + INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); From 0e9f98f97b6871261a5fd7420402039cc6a77198 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 15 Apr 2017 14:09:37 +0800 Subject: [PATCH 0261/1212] f2fs: in prior to issue big discard Keep issuing big size discard in prior instead of the one with random size, so that we expect that it will help to: - be quick to recycle unused large space in flash storage device. 
- give a chance for a) wait to merge small piece discards into bigger one, or b) avoid issuing discards while they have being reallocated by SSR. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +++++- fs/f2fs/segment.c | 54 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 85821e6d71fd..9ede6bd15084 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -255,6 +255,11 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + enum { D_PREP, D_SUBMIT, @@ -288,7 +293,7 @@ struct discard_cmd { struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ - struct list_head pend_list; /* store pending entries */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d237efa523ca..c77037521dfe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -677,9 +677,13 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->pend_list); + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; @@ -885,9 +889,16 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, return dc; } +static void 
__relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, block_t blkaddr) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_info di = dc->di; bool modified = false; @@ -898,6 +909,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, if (blkaddr > di.lstart) { dc->len = blkaddr - dc->lstart; + __relocate_discard_cmd(dcc, dc); modified = true; } @@ -911,6 +923,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart++; dc->len--; dc->start++; + __relocate_discard_cmd(dcc, dc); } } } @@ -969,6 +982,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di)) { prev_dc->di.len += di.len; + __relocate_discard_cmd(dcc, prev_dc); di = prev_dc->di; tdc = prev_dc; merged = true; @@ -980,6 +994,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; + __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); @@ -1040,16 +1055,20 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->pend_list); + struct list_head *pend_list; struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; struct blk_plug plug; + int i; mutex_lock(&dcc->cmd_lock); blk_start_plug(&plug); - list_for_each_entry_safe(dc, tmp, pend_list, list) - __submit_discard_cmd(sbi, dc); + for (i = 0; i < MAX_PLIST_NUM; i++) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + __submit_discard_cmd(sbi, dc); + } blk_finish_plug(&plug); 
list_for_each_entry_safe(dc, tmp, wait_list, list) { @@ -1065,26 +1084,30 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *pend_list = &dcc->pend_list; + struct list_head *pend_list; struct list_head *wait_list = &dcc->wait_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0; + int iter = 0, i; repeat: if (kthread_should_stop()) return 0; mutex_lock(&dcc->cmd_lock); blk_start_plug(&plug); - list_for_each_entry_safe(dc, tmp, pend_list, list) { - f2fs_bug_on(sbi, dc->state != D_PREP); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); - if (is_idle(sbi)) - __submit_discard_cmd(sbi, dc); + if (is_idle(sbi)) + __submit_discard_cmd(sbi, dc); - if (iter++ > DISCARD_ISSUE_RATE) - break; + if (iter++ > DISCARD_ISSUE_RATE) + goto next_step; + } } +next_step: blk_finish_plug(&plug); list_for_each_entry_safe(dc, tmp, wait_list, list) { @@ -1099,7 +1122,7 @@ static int issue_discard_thread(void *data) congestion_wait(BLK_RW_SYNC, HZ/50); wait_event_interruptible(*q, kthread_should_stop() || - !list_empty(pend_list) || !list_empty(wait_list)); + atomic_read(&dcc->discard_cmd_cnt)); goto repeat; } @@ -1379,7 +1402,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; - int err = 0; + int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; @@ -1391,7 +1414,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&dcc->entry_list); - INIT_LIST_HEAD(&dcc->pend_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); From 
048fe2a0a94798c9708f4ad3a0d3bcdd095f5da8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 15 Apr 2017 14:09:38 +0800 Subject: [PATCH 0262/1212] f2fs: trace __submit_discard_cmd Add an event class f2fs_discard for introducing f2fs_queue_discard, then use f2fs_{queue,issue}_discard to trace __{queue,submit}_discard_cmd. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- include/trace/events/f2fs.h | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c77037521dfe..e44fea9e2205 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -843,6 +843,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (dc->state != D_PREP) return; + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + dc->error = __blkdev_issue_discard(dc->bdev, SECTOR_FROM_BLOCK(dc->start), SECTOR_FROM_BLOCK(dc->len), @@ -1021,7 +1023,7 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, { block_t lblkstart = blkstart; - trace_f2fs_issue_discard(bdev, blkstart, blklen); + trace_f2fs_queue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 8ca1ddf50dc1..0796b2bf6870 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1118,7 +1118,7 @@ TRACE_EVENT(f2fs_write_checkpoint, __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), @@ -1142,6 +1142,20 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, 
blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), From 73d23680deb6dada7423e3d7656ca8fd743cc58e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Apr 2017 18:21:43 +0800 Subject: [PATCH 0263/1212] f2fs: introduce __check_rb_tree_consistence Introduce __check_rb_tree_consistence to check consistence of rb-tree based discard cache in runtime. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 32 ++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 15 +++++++++++++-- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 221ad086ee00..2f98d7039701 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -159,6 +159,38 @@ struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, return re; } +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9ede6bd15084..cab03e5532f0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2687,6 +2687,8 @@ struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force); +bool 
__check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e44fea9e2205..df0bb6c4bb90 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -912,6 +912,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, if (blkaddr > di.lstart) { dc->len = blkaddr - dc->lstart; __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); modified = true; } @@ -921,11 +922,15 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr, NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } else { dc->lstart++; dc->len--; dc->start++; __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } } } @@ -985,6 +990,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, __is_discard_back_mergeable(&di, &prev_dc->di)) { prev_dc->di.len += di.len; __relocate_discard_cmd(dcc, prev_dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); di = prev_dc->di; tdc = prev_dc; merged = true; @@ -999,13 +1006,17 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); - + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); merged = true; } - if (!merged) + if (!merged) { __insert_discard_tree(sbi, bdev, di.lstart, di.start, di.len, NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + } next: prev_dc = next_dc; if (!prev_dc) From e818486a9ada270ff28bbd21c02294122352b392 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Apr 2017 19:23:39 +0800 Subject: [PATCH 
0264/1212] f2fs: unlock cp_rwsem early for IPU writes For IPU writes, there won't be any updates in dnode page since we will reuse old block address instead of allocating new one, so we don't need to lock cp_rwsem during IPU IO submitting. Signed-off-by: Chao Yu --- fs/f2fs/data.c | 6 +++++- fs/f2fs/f2fs.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 85da089004ee..64aa38b21bf9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1355,6 +1355,8 @@ int do_write_data_page(struct f2fs_io_info *fio) !is_cold_data(page) && !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { + f2fs_unlock_op(F2FS_I_SB(inode)); + fio->cp_rwsem_locked = false; err = rewrite_data_page(fio); set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); @@ -1390,6 +1392,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .cp_rwsem_locked = true, }; trace_f2fs_writepage(page, DATA); @@ -1447,7 +1450,8 @@ static int __write_data_page(struct page *page, bool *submitted, err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + if (fio.cp_rwsem_locked) + f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) goto redirty_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cab03e5532f0..6e73b4aa0de2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -860,6 +860,7 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ + bool cp_rwsem_locked; /* indicate cp_rwsem is held */ }; #define is_read_io(rw) ((rw) == READ) From b88a1ae0f2d2baa6aab582fc22c7af238547a1e3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Apr 2017 19:27:39 +0800 Subject: [PATCH 0265/1212] f2fs: add undiscard blocks stat This patch adds to account undiscard blocks. 
Signed-off-by: Chao Yu --- fs/f2fs/debug.c | 5 +++-- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 9 +++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 692beff66bf8..6102737473d4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -69,6 +69,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->issing_discard); si->nr_discard_cmd = atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); @@ -348,11 +349,11 @@ static int stat_show(struct seq_file *s, void *v) si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " - "Discard: (%4d %4d)) cmd: %4d\n", + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, si->nr_discarding, si->nr_discarded, - si->nr_discard_cmd); + si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6e73b4aa0de2..1ca8f8963e61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -299,6 +299,7 @@ struct discard_cmd_control { struct mutex cmd_lock; int nr_discards; /* # of discards in the list */ int max_discards; /* max. 
discards to be issued */ + unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ @@ -2457,6 +2458,7 @@ struct f2fs_stat_info { int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_flushing, nr_flushed, nr_discarding, nr_discarded; int nr_discard_cmd; + unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df0bb6c4bb90..ba46aa20db1a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -695,6 +695,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; return dc; } @@ -723,6 +724,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, list_del(&dc->list); rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; kmem_cache_free(discard_cmd_slab, dc); @@ -909,8 +911,11 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, return; } + dcc->undiscard_blks -= di.len; + if (blkaddr > di.lstart) { dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); modified = true; @@ -928,6 +933,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart++; dc->len--; dc->start++; + dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); @@ -989,6 +995,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di)) { prev_dc->di.len += di.len; + 
dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); @@ -1003,6 +1010,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); @@ -1436,6 +1444,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = 0; + dcc->undiscard_blks = 0; dcc->root = RB_ROOT; init_waitqueue_head(&dcc->discard_wait_queue); From ac2de6c6cbd6e927584b805a10172eb5f1f1d330 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Tue, 18 Apr 2017 11:57:16 +0000 Subject: [PATCH 0266/1212] f2fs: introduce async IPU policy This patch introduces an ASYNC IPU policy. Under scenario of large # of async updating(e.g. log writing in Android), disk would be seriously fragmented, and higher frequent gc would be triggered. This patch uses IPU to rewrite the async update writing, since async is NOT sensitive to io latency. 
Signed-off-by: Hou Pengyang --- fs/f2fs/data.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.h | 12 +++++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 64aa38b21bf9..c990c4735505 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1354,7 +1354,7 @@ int do_write_data_page(struct f2fs_io_info *fio) if (unlikely(fio->old_blkaddr != NEW_ADDR && !is_cold_data(page) && !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { + need_inplace_update(inode, fio))) { f2fs_unlock_op(F2FS_I_SB(inode)); fio->cp_rwsem_locked = false; err = rewrite_data_page(fio); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1da2ceaaac3e..1ed58a631bac 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1903,7 +1903,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5f6ef163aa8f..3cd780a42f51 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -556,9 +556,11 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; @@ -581,6 +583,14 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) From 1b73445838adfda63d6bba040e3aec87b7d9818a Mon Sep 17 
00:00:00 2001 From: Jaegeuk Kim Date: Thu, 13 Apr 2017 15:17:00 -0700 Subject: [PATCH 0267/1212] f2fs: add ioctl to flush data from faster device to cold area This patch adds an ioctl to flush data in faster device to cold area. User can give device number and number of segments to move. It doesn't move it if there is only one device. The parameter looks like: struct f2fs_flush_device { u32 dev_num; /* device number to flush */ u32 segments; /* # of segments to flush */ }; Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++-- fs/f2fs/file.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- fs/f2fs/gc.c | 42 +++++++++++++++++++---------- fs/f2fs/segment.c | 14 +++++++--- fs/f2fs/segment.h | 7 ++++- 5 files changed, 120 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ca8f8963e61..b7052f911ea5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -367,6 +367,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -403,6 +405,11 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + /* * For INODE and NODE manager */ @@ -1047,7 +1054,6 @@ struct f2fs_sb_info { int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ /* For sysfs suppport */ @@ -2429,7 +2435,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, int start_gc_thread(struct 
f2fs_sb_info *sbi); void stop_gc_thread(struct f2fs_sb_info *sbi); block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); void build_gc_manager(struct f2fs_sb_info *sbi); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1ed58a631bac..fc1e6d048fd2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1860,7 +1860,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -2216,6 +2216,69 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; 
+ end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2253,6 +2316,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); default: return -ENOTTY; } @@ -2325,8 +2390,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 84ade968d149..39b738dc36c7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -84,7 +84,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -176,7 +176,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) p->offset = 0; else - p->offset = sbi->last_victim[p->gc_mode]; + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -295,6 +295,7 @@ static int 
get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -308,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -324,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -361,11 +371,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; - sbi->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } @@ -912,7 +922,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - 
change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -939,13 +948,14 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), @@ -990,13 +1000,17 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba46aa20db1a..888bde8cec34 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -401,7 +401,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -1834,6 +1834,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) return 0; + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; return CURSEG_I(sbi, type)->segno; } @@ -1931,12 +1933,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + 
unsigned segno = NULL_SEGNO; int i, cnt; bool reversed = false; /* need_SSR() already forces to do this */ - if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; return 1; + } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(type)) { @@ -1960,9 +1965,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) for (; cnt-- > 0; reversed ? i-- : i++) { if (i == type) continue; - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, i, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } } return 0; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3cd780a42f51..93cc4e504aab 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -138,7 +138,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -233,6 +236,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { From b16a719c967a5e27c036f0435ce0632983d1a857 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Apr 2017 13:47:25 -0700 Subject: [PATCH 0268/1212] f2fs: fix _IOW usage This patch fixes wrong _IOW usage. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b7052f911ea5..4a517cde5fd8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -362,9 +362,10 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) #define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ From e9b7e2e3bbc3cb0dd0a4f3af2c1a679d307a0380 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Apr 2017 15:03:15 -0700 Subject: [PATCH 0269/1212] f2fs: assign allocation hint for warm/cold data This patch gives slower device region to warm/cold data area more eagerly. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 39b738dc36c7..f3102a895c48 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1033,4 +1033,9 @@ void build_gc_manager(struct f2fs_sb_info *sbi) sbi->fggc_threshold = div64_u64((main_count - ovp_count) * BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } From 11538a935f968ab8aefaeedec772cae27e3f57eb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Apr 2017 19:38:33 +0200 Subject: [PATCH 0270/1212] f2fs: improve definition of statistic macros With a recent addition of f2fs_lookup_extent_tree(), we get a warning about the use of empty macros: fs/f2fs/extent_cache.c: In function 'f2fs_lookup_extent_tree': fs/f2fs/extent_cache.c:358:32: error: suggest braces around empty body in an 'else' statement [-Werror=empty-body] stat_inc_rbtree_node_hit(sbi); A good way to avoid the warning and make the code more robust is to define all no-op macros as 'do { } while (0)'. 
Fixes: 54c2258cd63a ("f2fs: extract rb-tree operation infrastructure") Signed-off-by: Arnd Bergmann Reivewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 58 +++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4a517cde5fd8..c98c07cee464 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2599,35 +2599,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_volatile_write(inode) -#define stat_dec_volatile_write(inode) -#define stat_update_max_volatile_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define 
stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } From 0cfd113b84607c3827f01049bc9cd75559a906d1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Apr 2017 13:51:57 -0700 Subject: [PATCH 0271/1212] f2fs: fix out-of free segments This patch also reverts d0db7703ac1 ("f2fs: do SSR in higher priority"). 
This patch fixes out of free segments caused by many small file creation by 1) mkfs -s 1 2G 2) mount 3) untar - produce 60000 small files in a burst 4) sync - flush node pages - flush imeta Here, when we do f2fs_balance_fs, we missed # of imeta blocks, resulting in skipping to check has_not_enough_free_secs. Another test is done by 1) mkfs -s 12 2G 2) mount 3) untar - produce 60000 small files in a burst 4) sync - flush node pages - flush imeta In this case, this patch also fixes wrong block allocation under large section size. Reported-by: William Brana Cc: Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- fs/f2fs/inode.c | 3 ++- fs/f2fs/segment.c | 26 +++++++++++++++++++++----- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c990c4735505..67d05b001722 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1470,7 +1470,8 @@ static int __write_data_page(struct page *page, bool *submitted, } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0900814485c7..518f49643092 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -338,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 888bde8cec34..f5dbb6ef8390 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -388,11 +388,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -1718,6 +1715,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; +} + /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1831,6 +1839,10 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { + /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) return 0; @@ -1980,17 +1992,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) From 352c91d0d482987cd5ed21f46671a80e2616c9a5 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Fri, 21 Apr 2017 12:41:48 +0000 Subject: [PATCH 0272/1212] f2fs: skip encrypted inode in ASYNC IPU policy Async request may be throttled in block layer, so page for async may keep WRITE_BACK for a long time. For encrypted inode, we need to wait on page writeback no matter if the device supports BDI_CAP_STABLE_WRITES. This may result in a higher waiting page writeback time for async encrypted inode page. This patch skips IPU for encrypted inode's updating write. 
Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 93cc4e504aab..8ad22b8cbba7 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -593,7 +593,8 @@ static inline bool need_inplace_update(struct inode *inode, */ if (policy & (0x1 << F2FS_IPU_ASYNC) && fio && fio->op == REQ_OP_WRITE && - !(fio->op_flags & REQ_SYNC)) + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) return true; /* this is only set during fdatasync */ From 38f30f047da2994e9b71b5411e2e60757b21f74d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Sat, 22 Apr 2017 10:39:20 +0800 Subject: [PATCH 0273/1212] f2fs: fix multiple f2fs_add_link() having same name for inline dentry Commit 88c5c13a5027 (f2fs: fix multiple f2fs_add_link() calls having same name) does not cover the scenario where inline dentry is enabled. In that case, F2FS_I(dir)->task will be NULL, and __f2fs_add_link will lookup dentries one more time. This patch fixes it by moving the assignment of current task to an upper level to cover both normal and inline dentry. 
Cc: Fixes: 88c5c13a5027 (f2fs: fix multiple f2fs_add_link() calls having same name) Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index db077960e376..f44e1370890f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -207,13 +207,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - /* This is to increase the speed of f2fs_create */ - if (!de && room) { - F2FS_I(dir)->task = current; - if (F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; - } + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; } return de; @@ -254,6 +250,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } From 9933f6e186a32d6f2da5581d97523b7fd99a4eba Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sat, 22 Apr 2017 18:06:26 +0800 Subject: [PATCH 0274/1212] f2fs: seperate read nat page from nat_tree_lock This patch seperate nat page read io from nat_tree_lock. -lock_page -get_node_info() -current_nat_addr ...... -> write_checkpoint -get_meta_page Because we lock node page, we can make sure no other threads modify this nid concurrently. So we just obtain current_nat_addr under nat_tree_lock, node info is always same in both nat pack. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dbf0efeb1cde..a63399338ff4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -376,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -401,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); From e7a9ce2e7cc68d29ed27926a40eafb33c9bb62e0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 00:21:34 +0800 Subject: [PATCH 0275/1212] f2fs: delay awaking discard thread It's better to delay awaking discard thread while queuing discard commands in checkpoint, it will help to give more chances for merging big and small discard. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f5dbb6ef8390..93c6d8a00722 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1047,7 +1047,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, blkstart -= FDEV(devi).start_blk; } __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); return 0; } @@ -1414,6 +1413,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) SM_I(sbi)->dcc_info->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } + + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) From 9170805a6362eb449000f3e47a4a9c39e0f0dd8a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 00:21:35 +0800 Subject: [PATCH 0276/1212] f2fs: enable small discard by default This patch start to enable 4K granularity small discard by default when realtime discard is on, so, in seriously fragmented space, small size discard can be issued in time to avoid useless storage space occupying of invalid filesystem's data, then performance of flash storage can be recovered. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/segment.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c98c07cee464..635bca168078 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -297,8 +297,8 @@ struct discard_cmd_control { struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. 
discards to be issued */ unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 93c6d8a00722..cc617da64d38 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1441,7 +1441,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->issing_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; - dcc->max_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; dcc->root = RB_ROOT; From 72b8a76169d7b2743479a693f4b15d36ccf70e0c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 20:21:37 +0800 Subject: [PATCH 0277/1212] f2fs: introduce __issue_discard_cmd Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 63 ++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc617da64d38..b49818dd02c4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1050,6 +1050,32 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int i, iter = 0; + + mutex_lock(&dcc->cmd_lock); + blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (!issue_cond || is_idle(sbi)) + __submit_discard_cmd(sbi, dc); + if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + goto out; + } + } +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + /* This should be covered by global mutex, 
&sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -1072,27 +1098,16 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list; struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; - struct blk_plug plug; - int i; + + __issue_discard_cmd(sbi, false); mutex_lock(&dcc->cmd_lock); - - blk_start_plug(&plug); - for (i = 0; i < MAX_PLIST_NUM; i++) { - pend_list = &dcc->pend_list[i]; - list_for_each_entry_safe(dc, tmp, pend_list, list) - __submit_discard_cmd(sbi, dc); - } - blk_finish_plug(&plug); - list_for_each_entry_safe(dc, tmp, wait_list, list) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } - mutex_unlock(&dcc->cmd_lock); } @@ -1101,32 +1116,15 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *pend_list; struct list_head *wait_list = &dcc->wait_list; struct discard_cmd *dc, *tmp; - struct blk_plug plug; - int iter = 0, i; repeat: if (kthread_should_stop()) return 0; + __issue_discard_cmd(sbi, true); + mutex_lock(&dcc->cmd_lock); - blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { - pend_list = &dcc->pend_list[i]; - list_for_each_entry_safe(dc, tmp, pend_list, list) { - f2fs_bug_on(sbi, dc->state != D_PREP); - - if (is_idle(sbi)) - __submit_discard_cmd(sbi, dc); - - if (iter++ > DISCARD_ISSUE_RATE) - goto next_step; - } - } -next_step: - blk_finish_plug(&plug); - list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->state == D_DONE) { wait_for_completion_io(&dc->wait); @@ -1135,7 +1133,6 @@ static int issue_discard_thread(void *data) } mutex_unlock(&dcc->cmd_lock); - iter = 0; congestion_wait(BLK_RW_SYNC, HZ/50); wait_event_interruptible(*q, 
kthread_should_stop() || From 6cd09438a3311c769442e877eb5fc1ae32ccc3e7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 20:21:38 +0800 Subject: [PATCH 0278/1212] f2fs: introduce __wait_discard_cmd Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b49818dd02c4..a0a0592e1681 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1076,6 +1076,22 @@ static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) mutex_unlock(&dcc->cmd_lock); } +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || dc->state == D_DONE) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -1097,18 +1113,8 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) /* This comes from f2fs_put_super */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - __issue_discard_cmd(sbi, false); - - mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } - mutex_unlock(&dcc->cmd_lock); + __wait_discard_cmd(sbi, false); } static int issue_discard_thread(void *data) @@ -1116,22 +1122,12 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info 
*sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *wait_list = &dcc->wait_list; - struct discard_cmd *dc, *tmp; repeat: if (kthread_should_stop()) return 0; __issue_discard_cmd(sbi, true); - - mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->state == D_DONE) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } - } - mutex_unlock(&dcc->cmd_lock); + __wait_discard_cmd(sbi, true); congestion_wait(BLK_RW_SYNC, HZ/50); From b465728ac362b0b069a3935fb65f1f019ed65ab2 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Tue, 25 Apr 2017 12:45:12 +0000 Subject: [PATCH 0279/1212] f2fs: reconstruct code to write a data page This patch introduces encrypt_one_page which encrypts one data page before submit_bio, and change the use of need_inplace_update. Signed-off-by: Hou Pengyang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 81 +++++++++++++++++++++++++++++------------------ fs/f2fs/file.c | 4 +-- fs/f2fs/segment.h | 6 +--- 3 files changed, 54 insertions(+), 37 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 67d05b001722..db9ed78156e8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1300,6 +1300,49 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if 
(PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_bios(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; @@ -1320,30 +1363,9 @@ int do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - PAGE_SIZE, 0, - fio->page->index, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } + err = encrypt_one_page(fio); + if (err) + goto out_writepage; set_page_writeback(page); @@ -1351,15 +1373,14 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode, fio))) { - f2fs_unlock_op(F2FS_I_SB(inode)); + if (need_inplace_update(fio)) { + f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); + f2fs_unlock_op(fio->sbi); fio->cp_rwsem_locked = false; + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fc1e6d048fd2..005129e03a67 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1903,7 +1903,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode, NULL)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2038,7 +2038,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) return -EINVAL; if (f2fs_readonly(sbi->sb)) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 8ad22b8cbba7..10bf05d4cff4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -564,16 +564,12 @@ enum { F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode, +static inline bool need_inplace_update_policy(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; From cf1770e0fa436b62f454732cd0e7842bed61430a Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Tue, 25 Apr 2017 12:45:13 +0000 Subject: 
[PATCH 0280/1212] f2fs: lookup extent cache first under IPU scenario If a page is cold, NOT atomic written and need_ipu now, there is a high probability that IPU should be adapted. For IPU, we try to check extent tree to get the block index first, instead of reading the dnode page, which may lead to a useless dnode IO, since no need to update the dnode index for IPU. Signed-off-by: Hou Pengyang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/gc.c --- fs/f2fs/data.c | 16 ++++++++++++++-- fs/f2fs/gc.c | 3 ++- fs/f2fs/segment.c | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index db9ed78156e8..b89b97be5ee4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1348,9 +1348,20 @@ int do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + if (fio->old_blkaddr != NULL_ADDR && + fio->old_blkaddr != NEW_ADDR) { + ipu_force = true; + goto got_it; + } + } err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) return err; @@ -1362,7 +1373,7 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - +got_it: err = encrypt_one_page(fio); if (err) goto out_writepage; @@ -1373,7 +1384,7 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data.
*/ - if (need_inplace_update(fio)) { + if (ipu_force || need_inplace_update(fio)) { f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); f2fs_unlock_op(fio->sbi); fio->cp_rwsem_locked = false; @@ -1410,6 +1421,7 @@ static int __write_data_page(struct page *page, bool *submitted, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = false, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f3102a895c48..32b3ae415260 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -713,7 +713,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a0a0592e1681..69ead09ba06f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -311,6 +311,7 @@ static int __commit_inmem_pages(struct inode *inode, } fio.page = page; + fio.old_blkaddr = NULL_ADDR; err = do_write_data_page(&fio); if (err) { unlock_page(page); From 0905adc8c720177c5c85c95d4aeb99d9a8cd92d5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Apr 2017 15:20:16 -0700 Subject: [PATCH 0281/1212] f2fs: introduce valid_ipu_blkaddr to clean up This patch introduces valid_ipu_blkaddr to clean up checking block address for inplace-update. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b89b97be5ee4..fe27b2851336 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1331,8 +1331,6 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (fio->old_blkaddr == NEW_ADDR) - return false; if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; if (is_cold_data(fio->page)) @@ -1343,6 +1341,15 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) return need_inplace_update_policy(inode, fio); } +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; @@ -1356,8 +1363,8 @@ int do_write_data_page(struct f2fs_io_info *fio) if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (fio->old_blkaddr != NULL_ADDR && - fio->old_blkaddr != NEW_ADDR) { + + if (valid_ipu_blkaddr(fio)) { ipu_force = true; goto got_it; } @@ -1384,7 +1391,7 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (ipu_force || need_inplace_update(fio)) { + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); f2fs_unlock_op(fio->sbi); fio->cp_rwsem_locked = false; From 95d6aa32c3c2a250b30562cc19db9dac602b93f4 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 25 Apr 2017 16:28:48 -0700 Subject: [PATCH 0282/1212] f2fs: sanity check segment count F2FS uses 4 bytes to represent block address. As a result, supported size of disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments. 
Signed-off-by: Jin Qian Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ include/linux/f2fs_fs.h | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aa6ee31ef39b..9c310d8a6da1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1496,6 +1496,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e2d239ed4c60..661200e6d281 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -301,6 +301,12 @@ struct f2fs_nat_block { #define SIT_VBLOCK_MAP_SIZE 64 #define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) +/* + * F2FS uses 4 bytes to represent block address. As a result, supported size of + * disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments. + */ +#define F2FS_MAX_SEGMENT ((16 * 1024 * 1024) / 2) + /* * Note that f2fs_sit_entry->vblocks has the following bit-field information. * [15:10] : allocation type such as CURSEG_XXXX_TYPE From 5abcd71d0fd8a642d848a13de041e2112df21a23 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 26 Apr 2017 11:11:12 -0700 Subject: [PATCH 0283/1212] f2fs: nullify fio->encrypted_page for each writes This makes sure each write request has nullified encrypted_page pointer. 
Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 69ead09ba06f..9fcc2f9aa732 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -290,8 +290,7 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, - .encrypted_page = NULL, + .op_flags = REQ_SYNC | REQ_PRIO, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -312,6 +311,7 @@ static int __commit_inmem_pages(struct inode *inode, fio.page = page; fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; err = do_write_data_page(&fio); if (err) { unlock_page(page); From 7ec84ed608e4fe4b00189ed36363da553c092eaf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Apr 2017 17:39:54 +0800 Subject: [PATCH 0284/1212] f2fs: don't hold cmd_lock during waiting discard command Previously, with protection of cmd_lock, we will wait for end io of discard command which potentially may lead long latency, making worse concurrency. So, in this patch, we try to add reference into discard entry to prevent the entry being released by other thread, then we can avoid holding global cmd_lock during waiting discard to finish. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 635bca168078..b20b3b29bc27 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -286,6 +286,7 @@ struct discard_cmd { struct list_head list; /* command list */ struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ int state; /* state */ int error; /* bio error */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9fcc2f9aa732..6a79d0b3b423 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -688,6 +688,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart = lstart; dc->start = start; dc->len = len; + dc->ref = 0; dc->state = D_PREP; dc->error = 0; init_completion(&dc->wait); @@ -1086,6 +1087,8 @@ static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { if (!wait_cond || dc->state == D_DONE) { + if (dc->ref) + continue; wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } @@ -1098,17 +1101,29 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; + bool need_wait = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); if (dc) { - if (dc->state != D_PREP) - wait_for_completion_io(&dc->wait); - __punch_discard_cmd(sbi, dc, blkaddr); + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } } - mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); 
+ mutex_unlock(&dcc->cmd_lock); + } } /* This comes from f2fs_put_super */ From 0756d8f7982ebaf4de78364a7cada24a6e31098c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Apr 2017 17:39:55 +0800 Subject: [PATCH 0285/1212] f2fs: shrink size of struct discard_cmd In order to shrink size of struct discard_cmd, change variable type of @state in struct discard_cmd from int to unsigned char. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b20b3b29bc27..125d006ed5ff 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -287,7 +287,7 @@ struct discard_cmd { struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ - int state; /* state */ + unsigned char state; /* state */ int error; /* bio error */ }; From 87c98567046f8f5890bbb7e5dab874ced61ccbf8 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 27 Apr 2017 00:17:21 +0800 Subject: [PATCH 0286/1212] f2fs: release cp and dnode lock before IPU We don't need to rewrite the page under cp_rwsem and dnode locks. 
Signed-off-by: Hou Pengyang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 39 ++++++++++++++++++++++++--------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 1 + fs/f2fs/segment.c | 1 + 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fe27b2851336..c1e881242d53 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1366,12 +1366,17 @@ int do_write_data_page(struct f2fs_io_info *fio) if (valid_ipu_blkaddr(fio)) { ipu_force = true; + fio->need_lock = false; goto got_it; } } + + if (fio->need_lock) + f2fs_lock_op(fio->sbi); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1392,22 +1397,26 @@ int do_write_data_page(struct f2fs_io_info *fio) * it had better in-place writes for updated data. */ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { - f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); - f2fs_unlock_op(fio->sbi); - fio->cp_rwsem_locked = false; - + f2fs_put_dnode(&dn); + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); err = rewrite_data_page(fio); trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); return err; } @@ -1432,7 +1441,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, - .cp_rwsem_locked = true, + .need_lock = true, }; trace_f2fs_writepage(page, DATA); @@ 
-1468,6 +1477,7 @@ static int __write_data_page(struct page *page, bool *submitted, /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = false; err = do_write_data_page(&fio); goto done; } @@ -1485,13 +1495,12 @@ static int __write_data_page(struct page *page, bool *submitted, if (!err) goto out; } - f2fs_lock_op(sbi); + if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - if (fio.cp_rwsem_locked) - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 125d006ed5ff..bb1b6ce66c1a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -870,7 +870,7 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ - bool cp_rwsem_locked; /* indicate cp_rwsem is held */ + bool need_lock; /* indicate we need to lock cp_rwsem */ }; #define is_read_io(rw) ((rw) == READ) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 32b3ae415260..b527ab0eec1d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -717,6 +717,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = true, }; bool is_dirty = PageDirty(page); int err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6a79d0b3b423..9e15496036ff 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -312,6 +312,7 @@ static int __commit_inmem_pages(struct inode *inode, fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; + fio.need_lock = false, err = do_write_data_page(&fio); if (err) { unlock_page(page); From 30d60edd7becac313a6a3adb9733ea123695b2f6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Apr 2017 20:40:39 +0800 Subject: [PATCH 0287/1212] f2fs: allow cpc->reason to indicate more than one reason Change to 
use different bits of cpc->reason to indicate different status, so cpc->reason can indicate more than one reason. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 14 +++++++------- fs/f2fs/f2fs.h | 16 +++++++--------- fs/f2fs/segment.c | 8 ++++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0983b7646444..b1a86997b115 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1055,17 +1055,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); - if (cpc->reason == CP_UMOUNT && + if ((cpc->reason & CP_UMOUNT) && le32_to_cpu(ckpt->cp_pack_total_block_count) > sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); - if (cpc->reason == CP_UMOUNT) + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1273,8 +1273,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1296,7 +1296,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { if (!exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; @@ -1334,7 +1334,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) 
unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb1b6ce66c1a..4c7eb0b6b4ca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -192,13 +192,11 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ @@ -1332,7 +1330,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, { bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set; + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1368,7 +1366,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9e15496036ff..444ea2c4f671 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1262,7 +1262,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); struct discard_entry *de = NULL; struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; @@ -1345,7 +1345,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long *prefree_map 
= dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -2849,7 +2849,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2885,7 +2885,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) From a725708ca434461dbaaefa7ec5004373ec3ba054 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Apr 2017 13:56:08 +0800 Subject: [PATCH 0288/1212] f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard Introduce CP_TRIMMED_FLAG to indicate all invalid block were trimmed before umount, so once we do mount with image which contain the flag, we don't record invalid blocks as undiscard one, when fstrim is being triggered, we can avoid issuing redundant discard commands. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 ++++++++++++++++++++-------- fs/f2fs/super.c | 7 +++++++ include/linux/f2fs_fs.h | 1 + include/trace/events/f2fs.h | 4 +++- 6 files changed, 35 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b1a86997b115..d639fd9062d4 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1060,6 +1060,9 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4c7eb0b6b4ca..b34d527ba809 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -197,6 +197,7 @@ enum { #define CP_SYNC 0x00000004 #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 444ea2c4f671..23e809f64ded 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3084,10 +3084,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -3111,10 +3118,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - 
memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9c310d8a6da1..eeda97b54556 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -797,6 +797,13 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ f2fs_wait_discard_bios(sbi); + if (!sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 661200e6d281..2b7183c5c9a3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 0796b2bf6870..0d02af995547 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -44,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); +TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_block_type(type) \ __print_symbolic(type, \ @@ -118,7 +119,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ - { CP_DISCARD, "Discard" }) + { CP_DISCARD, "Discard" }, \ + { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" }) struct victim_sel_policy; struct f2fs_map_blocks; From 
19023fdfb16384ce695bd7ce07902518d39f5435 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 May 2017 18:09:44 -0700 Subject: [PATCH 0289/1212] f2fs: flush dirty nats periodically This patch flushes dirty nats in order to acquire available nids by writing checkpoint. Otherwise, we can have no chance to get freed nids. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23e809f64ded..5fdc995b1f1e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -419,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ From d15370b84bd18bbe31358e9ba110ac3b7c8ff18a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 May 2017 18:13:03 -0700 Subject: [PATCH 0290/1212] f2fs: show available_nids in f2fs/status This patch adds an entry in f2fs/status to show # of available nids. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 +++-- fs/f2fs/f2fs.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6102737473d4..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -97,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -370,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b34d527ba809..4d086c7c2138 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2461,7 +2461,8 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_flushing, nr_flushed, nr_discarding, nr_discarded; From 02ac4707889068a4b739a1ed52637cff8390a41f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 3 May 2017 23:59:13 +0800 Subject: [PATCH 0291/1212] f2fs: relocate inode_{,un}lock in F2FS_IOC_SETFLAGS This patch expands cover region of 
inode->i_rwsem to keep setting flag atomically. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 005129e03a67..70be377c2236 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1478,10 +1478,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -1495,10 +1495,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); + + inode_unlock(inode); out: mnt_drop_write_file(filp); return ret; From 60a9766f27c7b0f5cae5db408edc6e6bb86538a2 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 26 Apr 2017 15:56:52 +0800 Subject: [PATCH 0292/1212] f2fs: fix a mount fail for wrong next_scan_nid -write_checkpoint -do_checkpoint -next_free_nid <--- something wrong with next free nid -f2fs_fill_super -build_node_manager -build_free_nids -get_current_nat_page -__get_meta_page <--- attempt to access beyond end of device Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a63399338ff4..833f5fb9858c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1958,6 +1958,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; From b53229d776afa1626da3f362ad9f82884c8555ad Mon Sep 17 00:00:00 2001 From: 
Eric Biggers Date: Fri, 7 Apr 2017 10:58:39 -0700 Subject: [PATCH 0293/1212] f2fs: sync f2fs_lookup() with ext4_lookup() As for ext4, now that fscrypt_has_permitted_context() correctly handles the case where we have the key for the parent directory but not the child, f2fs_lookup() no longer has to work around it. Also add the same warning message that ext4 uses. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 35fca4c39993..77349d51f952 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -324,9 +324,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); From 64f3b27a09c34664fb3023fcfdd95834948a4a1f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Apr 2017 10:00:08 -0700 Subject: [PATCH 0294/1212] f2fs: check entire encrypted bigname when finding a dentry If user has no key under an encrypted dir, fscrypt gives digested dentries. Previously, when looking up a dentry, f2fs only checks its hash value with first 4 bytes of the digested dentry, which didn't handle hash collisions fully. This patch enhances to check entire dentry bytes likewise ext4. 
Eric reported how to reproduce this issue by: # seq -f "edir/abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch # find edir -type f | xargs stat -c %i | sort | uniq | wc -l 100000 # sync # echo 3 > /proc/sys/vm/drop_caches # keyctl new_session # find edir -type f | xargs stat -c %i | sort | uniq | wc -l 99999 Cc: Reported-by: Eric Biggers Signed-off-by: Jaegeuk Kim (fixed f2fs_dentry_hash() to work even when the hash is 0) Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Conflicts: fs/f2fs/inline.c --- fs/f2fs/dir.c | 37 +++++++++++++++++++++---------------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/hash.c | 7 ++++++- fs/f2fs/inline.c | 4 ++-- 4 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f44e1370890f..9dbf44a28520 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -130,19 +130,29 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ + if (de->hash_code != namehash) + goto not_match; + de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (unlikely(!name->name)) { + if (fname->usr_fname->name[0] == '_') { + if (de_name.len >= 16 && + !memcmp(de_name.name + de_name.len - 16, + fname->crypto_buf.name + 8, 16)) + goto found; + goto not_match; + } + name->name = fname->crypto_buf.name; + name->len = fname->crypto_buf.len; + } +#endif + if (de_name.len == name->len && + !memcmp(de_name.name, name->name, name->len)) goto found; - +not_match: if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; @@ -170,12 +180,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - 
f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -526,7 +531,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d086c7c2138..b0a093e38104 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2258,7 +2258,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b3bd1012a4fc..fc8b49696b9d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); 
inline_dentry = inline_data_addr(ipage); @@ -533,7 +533,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); + name_hash = f2fs_dentry_hash(new_name, NULL); make_dentry_ptr_inline(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); From 73c0288f1c9c90c39170b1246de431adb5a85fc8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Apr 2017 10:00:09 -0700 Subject: [PATCH 0295/1212] fscrypt: avoid collisions when presenting long encrypted filenames When accessing an encrypted directory without the key, userspace must operate on filenames derived from the ciphertext names, which contain arbitrary bytes. Since we must support filenames as long as NAME_MAX, we can't always just base64-encode the ciphertext, since that may make it too long. Currently, this is solved by presenting long names in an abbreviated form containing any needed filesystem-specific hashes (e.g. to identify a directory block), then the last 16 bytes of ciphertext. This needs to be sufficient to identify the actual name on lookup. However, there is a bug. It seems to have been assumed that due to the use of a CBC (ciphertext block chaining)-based encryption mode, the last 16 bytes (i.e. the AES block size) of ciphertext would depend on the full plaintext, preventing collisions. However, we actually use CBC with ciphertext stealing (CTS), which handles the last two blocks specially, causing them to appear "flipped". Thus, it's actually the second-to-last block which depends on the full plaintext. This caused long filenames that differ only near the end of their plaintexts to, when observed without the key, point to the wrong inode and be undeletable. 
For example, with ext4: # echo pass | e4crypt add_key -p 16 edir/ # seq -f "edir/abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch # find edir/ -type f | xargs stat -c %i | sort | uniq | wc -l 100000 # sync # echo 3 > /proc/sys/vm/drop_caches # keyctl new_session # find edir/ -type f | xargs stat -c %i | sort | uniq | wc -l 2004 # rm -rf edir/ rm: cannot remove 'edir/_A7nNFi3rhkEQlJ6P,hdzluhODKOeWx5V': Structure needs cleaning ... To fix this, when presenting long encrypted filenames, encode the second-to-last block of ciphertext rather than the last 16 bytes. Although it would be nice to solve this without depending on a specific encryption mode, that would mean doing a cryptographic hash like SHA-256 which would be much less efficient. This way is sufficient for now, and it's still compatible with encryption modes like HEH which are strong pseudorandom permutations. Also, changing the presented names is still allowed at any time because they are only provided to allow applications to do things like delete encrypted directories. They're not designed to be used to persistently identify files --- which would be hard to do anyway, given that they're encrypted after all. For ease of backports, this patch only makes the minimal fix to both ext4 and f2fs. It leaves ubifs as-is, since ubifs doesn't compare the ciphertext block yet. Follow-on patches will clean things up properly and make the filesystems use a shared helper function. 
Fixes: 5de0b4d0cd15 ("ext4 crypto: simplify and speed up filename encryption") Reported-by: Gwendal Grignou Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/f2fs/dir.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 13052b85c393..932881f27f2f 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -300,7 +300,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } else { memset(buf, 0, 8); } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); oname->name[0] = '_'; oname->len = 1 + digest_encode(buf, 24, oname->name + 1); return 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 38eb0c8e43b9..dac159a226ad 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1243,9 +1243,9 @@ static inline int ext4_match(struct ext4_filename *fname, if (unlikely(!name)) { if (fname->usr_fname->name[0] == '_') { int ret; - if (de->name_len < 16) + if (de->name_len <= 32) return 0; - ret = memcmp(de->name + de->name_len - 16, + ret = memcmp(de->name + ((de->name_len - 17) & ~15), fname->crypto_buf.name + 8, 16); return (ret == 0) ? 
1 : 0; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9dbf44a28520..c716ab0baf1d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -139,8 +139,8 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, #ifdef CONFIG_F2FS_FS_ENCRYPTION if (unlikely(!name->name)) { if (fname->usr_fname->name[0] == '_') { - if (de_name.len >= 16 && - !memcmp(de_name.name + de_name.len - 16, + if (de_name.len > 32 && + !memcmp(de_name.name + ((de_name.len - 17) & ~15), fname->crypto_buf.name + 8, 16)) goto found; goto not_match; From e9dbf926ed236a065dfd8a8f930564fe0f1f2b73 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 21 Feb 2017 15:07:11 -0800 Subject: [PATCH 0296/1212] fscrypt: remove broken support for detecting keyring key revocation Filesystem encryption ostensibly supported revoking a keyring key that had been used to "unlock" encrypted files, causing those files to become "locked" again. This was, however, buggy for several reasons, the most severe of which was that when key revocation happened to be detected for an inode, its fscrypt_info was immediately freed, even while other threads could be using it for encryption or decryption concurrently. This could be exploited to crash the kernel or worse. This patch fixes the use-after-free by removing the code which detects the keyring key having been revoked, invalidated, or expired. Instead, an encrypted inode that is "unlocked" now simply remains unlocked until it is evicted from memory. Note that this is no worse than the case for block device-level encryption, e.g. dm-crypt, and it still remains possible for a privileged user to evict unused pages, inodes, and dentries by running 'sync; echo 3 > /proc/sys/vm/drop_caches', or by simply unmounting the filesystem. In fact, one of those actions was already needed anyway for key revocation to work even somewhat sanely. This change is not expected to break any applications. 
In the future I'd like to implement a real API for fscrypt key revocation that interacts sanely with ongoing filesystem operations --- waiting for existing operations to complete and blocking new operations, and invalidating and sanitizing key material and plaintext from the VFS caches. But this is a hard problem, and for now this bug must be fixed. This bug affected almost all versions of ext4, f2fs, and ubifs encryption, and it was potentially reachable in any kernel configured with encryption support (CONFIG_EXT4_ENCRYPTION=y, CONFIG_EXT4_FS_ENCRYPTION=y, CONFIG_F2FS_FS_ENCRYPTION=y, or CONFIG_UBIFS_FS_ENCRYPTION=y). Note that older kernels did not use the shared fs/crypto/ code, but due to the potential security implications of this bug, it may still be worthwhile to backport this fix to them. Fixes: b7236e21d55f ("ext4 crypto: reorganize how we store keys in the inode") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Acked-by: Michael Halcrow --- fs/crypto/crypto.c | 10 +------ fs/crypto/fname.c | 2 +- fs/crypto/fscrypt_private.h | 4 --- fs/crypto/keyinfo.c | 52 +++++++------------------------------ 4 files changed, 11 insertions(+), 57 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 02a7a9286449..6d6eca394d4d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -327,7 +327,6 @@ EXPORT_SYMBOL(fscrypt_decrypt_page); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -339,18 +338,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); 
cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 932881f27f2f..15bf9c31a34d 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = fscrypt_get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ea01e5279675..ab0440274630 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -67,7 +67,6 @@ struct fscrypt_info { u8 ci_filename_mode; u8 ci_flags; struct crypto_skcipher *ci_ctfm; - struct key *ci_keyring_key; u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; @@ -111,7 +110,4 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); -/* keyinfo.c */ -extern int fscrypt_get_crypt_info(struct inode *); - #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 02eb6b9e4438..cb3e82abf034 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -95,6 +95,7 @@ static int validate_user_key(struct fscrypt_info *crypt_info, kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -102,11 +103,9 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; @@ -117,17 +116,11 @@ static int 
validate_user_key(struct fscrypt_info *crypt_info, "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } @@ -169,12 +162,11 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int fscrypt_get_crypt_info(struct inode *inode) +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -184,21 +176,15 @@ int fscrypt_get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; + if (inode->i_crypt_info) + return 0; + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; if (!inode->i_sb->s_cop->get_context) return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { @@ -229,7 +215,6 @@ int fscrypt_get_crypt_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -273,14 +258,8 @@ int fscrypt_get_crypt_info(struct inode *inode) if (res) goto out; - kzfree(raw_key); - raw_key = NULL; - if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { - put_crypt_info(crypt_info); - goto retry; - } - return 0; - + if 
(cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; @@ -288,6 +267,7 @@ int fscrypt_get_crypt_info(struct inode *inode) kzfree(raw_key); return res; } +EXPORT_SYMBOL(fscrypt_get_encryption_info); void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) { @@ -305,17 +285,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) put_crypt_info(ci); } EXPORT_SYMBOL(fscrypt_put_encryption_info); - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *ci = inode->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return fscrypt_get_crypt_info(inode); - return 0; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); From 0addb61dc70fff224c344a30bf38345d96ce7fdb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 22 Feb 2017 13:25:14 -0800 Subject: [PATCH 0297/1212] fscrypt: eliminate ->prepare_context() operation The only use of the ->prepare_context() fscrypt operation was to allow ext4 to evict inline data from the inode before ->set_context(). However, there is no reason why this cannot be done as simply the first step in ->set_context(), and in fact it makes more sense to do it that way because then the policy modes and flags get validated before any real work is done. Therefore, merge ext4_prepare_context() into ext4_set_context(), and remove ->prepare_context(). 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Conflicts: fs/ext4/super.c --- fs/crypto/policy.c | 7 ------- include/linux/fscrypt_common.h | 1 - 2 files changed, 8 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 14b76da71269..4908906d54d5 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -33,17 +33,10 @@ static int create_encryption_context_from_policy(struct inode *inode, const struct fscrypt_policy *policy) { struct fscrypt_context ctx; - int res; if (!inode->i_sb->s_cop->set_context) return -EOPNOTSUPP; - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 547f81592ba1..10c1abfbac6c 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -87,7 +87,6 @@ struct fscrypt_operations { unsigned int flags; const char *key_prefix; int (*get_context)(struct inode *, void *, size_t); - int (*prepare_context)(struct inode *); int (*set_context)(struct inode *, const void *, size_t, void *); int (*dummy_context)(struct inode *); bool (*is_encrypted)(struct inode *); From 31469fc2488f66e7b43f80088690dd386e0d12b6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Apr 2017 14:39:41 -0700 Subject: [PATCH 0298/1212] fscrypt: remove unnecessary checks for NULL operations The functions in fs/crypto/*.c are only called by filesystems configured with encryption support. Since the ->get_context(), ->set_context(), and ->empty_dir() operations are always provided in that case (and must be, otherwise there would be no way to get/set encryption policies, or in the case of ->get_context() even access encrypted files at all), there is no need to check for these operations being NULL and we can remove these unneeded checks. 
Signed-off-by: Eric Biggers Reviewed-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 3 --- fs/crypto/policy.c | 11 +---------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index cb3e82abf034..4636c18c2fb9 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -183,9 +183,6 @@ int fscrypt_get_encryption_info(struct inode *inode) if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4908906d54d5..d71ec3780d0c 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -34,9 +34,6 @@ static int create_encryption_context_from_policy(struct inode *inode, { struct fscrypt_context ctx; - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); @@ -87,8 +84,6 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else @@ -118,8 +113,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -202,9 +196,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; From 
c24873a651517a8a247a2f90f4ca8631747e793e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Apr 2017 14:43:34 -0700 Subject: [PATCH 0299/1212] fscrypt: remove fscrypt_symlink_data_len() fscrypt_symlink_data_len() is never called and can be removed. Signed-off-by: Eric Biggers Reviewed-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_common.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 10c1abfbac6c..0a30c106c1e5 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -46,17 +46,6 @@ struct fscrypt_symlink_data { char encrypted_path[1]; } __packed; -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 fscrypt_symlink_data_len(u32 l) -{ - if (l < FS_CRYPTO_BLOCK_SIZE) - l = FS_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct fscrypt_symlink_data) - 1); -} - struct fscrypt_str { unsigned char *name; u32 len; From 171695f2abca44894f48e90401b0ebb46035893f Mon Sep 17 00:00:00 2001 From: Joe Richey Date: Thu, 6 Apr 2017 16:14:05 -0700 Subject: [PATCH 0300/1212] fscrypt: Move key structure and constants to uapi This commit exposes the necessary constants and structures for a userspace program to pass filesystem encryption keys into the keyring. The fscrypt_key structure was already part of the kernel ABI, this change just makes it so programs no longer have to redeclare these structures (like e4crypt in e2fsprogs currently does). Note that we do not expose the other FS_*_KEY_SIZE constants as they are not necessary. Only XTS is supported for contents_encryption_mode, so currently FS_MAX_KEY_SIZE bytes of key material must always be passed to the kernel. This commit also removes __packed from fscrypt_key as it does not contain any implicit padding and does not refer to an on-disk structure. 
Signed-off-by: Joe Richey Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 11 ----------- include/uapi/linux/fs.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ab0440274630..6ed4ad422fc3 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -22,10 +22,6 @@ #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 #define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 #define FS_KEY_DERIVATION_NONCE_SIZE 16 @@ -51,13 +47,6 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index ea33e08d9d75..a1533084395c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -203,6 +203,19 @@ struct fscrypt_policy { #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ From 48c7f9c819ac97658839140d25e7505b397c6ffe Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 7 Apr 2017 10:58:37 -0700 Subject: [PATCH 0301/1212] fscrypt: fix context consistency check when key(s) unavailable To mitigate some types of offline attacks, filesystem encryption is 
designed to enforce that all files in an encrypted directory tree use the same encryption policy (i.e. the same encryption context excluding the nonce). However, the fscrypt_has_permitted_context() function which enforces this relies on comparing struct fscrypt_info's, which are only available when we have the encryption keys. This can cause two incorrect behaviors: 1. If we have the parent directory's key but not the child's key, or vice versa, then fscrypt_has_permitted_context() returned false, causing applications to see EPERM or ENOKEY. This is incorrect if the encryption contexts are in fact consistent. Although we'd normally have either both keys or neither key in that case since the master_key_descriptors would be the same, this is not guaranteed because keys can be added or removed from keyrings at any time. 2. If we have neither the parent's key nor the child's key, then fscrypt_has_permitted_context() returned true, causing applications to see no error (or else an error for some other reason). This is incorrect if the encryption contexts are in fact inconsistent, since in that case we should deny access. To fix this, retrieve and compare the fscrypt_contexts if we are unable to set up both fscrypt_infos. While this slightly hurts performance when accessing an encrypted directory tree without the key, this isn't a case we really need to be optimizing for; access *with* the key is much more important. Furthermore, the performance hit is barely noticeable given that we are already retrieving the fscrypt_context and doing two keyring searches in fscrypt_get_encryption_info(). If we ever actually wanted to optimize this case we might start by caching the fscrypt_contexts. 
Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/policy.c | 87 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index d71ec3780d0c..210976e7a269 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -137,27 +137,61 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. 
+ */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". 
+ */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -166,17 +200,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); From 8c66df6c7a8b22e94c37a6d8374a5ee2ad1dd27f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Apr 2017 10:00:10 -0700 Subject: [PATCH 0302/1212] fscrypt: introduce helper function for filename matching Introduce a helper function fscrypt_match_name() which tests whether a fscrypt_name matches a directory entry. Also clean up the magic numbers and document things properly. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 90 +++++++++++++++++++++++++-------- fs/crypto/fscrypt_private.h | 2 - include/linux/fscrypt_notsupp.h | 9 ++++ include/linux/fscrypt_supp.h | 78 ++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 22 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 15bf9c31a34d..d1bb02b1ee58 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. 
+ * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. 
+ * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = 
digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 6ed4ad422fc3..0b65491de28a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -13,8 +13,6 @@ #include -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - /* Encryption parameters */ #define FS_XTS_TWEAK_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 3511ca798804..ec406aed2f2f 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -147,6 +147,15 @@ static inline int fscrypt_fname_usr_to_disk(struct inode *inode, return -EOPNOTSUPP; } +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index a140f47e9b27..e12c224a0d1e 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -57,6 +57,84 @@ extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, struct fscrypt_str *); +#define 
FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define FSCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \ + FS_CRYPTO_BLOCK_SIZE)) + +#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE + +/** + * fscrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes collisions very unlikely: + * just a 1 in 2^128 chance for two filenames to collide even if they share the + * same filesystem-specific hashes. + * + * This scheme isn't strictly immune to intentional collisions because it's + * basically like a CBC-MAC, which isn't secure on variable-length inputs. + * However, generating a CBC-MAC collision requires the ability to choose + * arbitrary ciphertext, which won't normally be possible with filename + * encryption since it would require write access to the raw disk. 
+ * + * Taking a real cryptographic hash like SHA-256 over the full ciphertext would + * be better in theory but would be less efficient and more complicated to + * implement, especially since the filesystem would need to calculate it for + * each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. 
+ */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); From a1425ed23988ee78ab0bde734117b382da3bba39 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Apr 2017 10:00:12 -0700 Subject: [PATCH 0303/1212] f2fs: switch to using fscrypt_match_name() Switch f2fs directory searches to use the fscrypt_match_name() helper function. There should be no functional change. 
Signed-off-by: Eric Biggers Acked-by: Jaegeuk Kim Signed-off-by: Theodore Ts'o --- fs/f2fs/dir.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c716ab0baf1d..a87a5ecca74d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,29 +128,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - if (de->hash_code != namehash) - goto not_match; - - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (unlikely(!name->name)) { - if (fname->usr_fname->name[0] == '_') { - if (de_name.len > 32 && - !memcmp(de_name.name + ((de_name.len - 17) & ~15), - fname->crypto_buf.name + 8, 16)) - goto found; - goto not_match; - } - name->name = fname->crypto_buf.name; - name->len = fname->crypto_buf.len; - } -#endif - if (de_name.len == name->len && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; -not_match: + if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; From a156aa8444353737f3e23aa7b1646852b9f0dea2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 1 May 2017 11:43:32 -0700 Subject: [PATCH 0304/1212] fscrypt: correct collision claim for digested names As I noted on the mailing list, it's easier than I originally thought to create intentional collisions in the digested names. Unfortunately it's not too easy to solve this, so for now just fix the comment to not lie. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_supp.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index e12c224a0d1e..cd4e82c17304 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -81,20 +81,16 @@ extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, * followed by the second-to-last ciphertext block of the filename. Due to the * use of the CBC-CTS encryption mode, the second-to-last ciphertext block * depends on the full plaintext. (Note that ciphertext stealing causes the - * last two blocks to appear "flipped".) This makes collisions very unlikely: - * just a 1 in 2^128 chance for two filenames to collide even if they share the - * same filesystem-specific hashes. + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. * - * This scheme isn't strictly immune to intentional collisions because it's - * basically like a CBC-MAC, which isn't secure on variable-length inputs. - * However, generating a CBC-MAC collision requires the ability to choose - * arbitrary ciphertext, which won't normally be possible with filename - * encryption since it would require write access to the raw disk. - * - * Taking a real cryptographic hash like SHA-256 over the full ciphertext would - * be better in theory but would be less efficient and more complicated to - * implement, especially since the filesystem would need to calculate it for - * each directory entry examined during a search. + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. 
Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. */ struct fscrypt_digested_name { u32 hash; From 6190400da0498c63e01f1984f3386e95c487d2f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Jul 2017 12:24:49 -0700 Subject: [PATCH 0305/1212] f2fs, block_dump: give WRITE direction to submit_bio The block_dump in submit_bio uses rw, instead of bio->bi_rw. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/segment.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c1e881242d53..ae6c1353529f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -224,7 +224,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); - submit_bio(0, bio); + submit_bio(bio_op(bio), bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5fdc995b1f1e..c35e70e72e8b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -809,7 +809,7 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, } if (bio) { - int ret = submit_bio_wait(0, bio); + int ret = submit_bio_wait(op, bio); bio_put(bio); if (ret) return ret; From cc4611491956055e8a414351e6a180677c46ff08 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 15 May 2017 10:45:08 -0700 Subject: [PATCH 0306/1212] f2fs: sanity check checkpoint segno and blkoff Make sure segno and blkoff read from raw image are valid. 
Cc: stable@vger.kernel.org Signed-off-by: Jin Qian [Jaegeuk Kim: adjust minor coding style] Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index eeda97b54556..fb68af6b04c5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1523,6 +1523,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1544,6 +1546,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 34c2b318e3ce93a2d7a96f7545fc7c279e492b2d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 16 May 2017 13:20:16 -0700 Subject: [PATCH 0307/1212] f2fs: load inode's flag from disk This patch fixes missing inode flag loaded from disk, reported by Tom. 
[tom@localhost ~]$ sudo mount /dev/loop0 /mnt/ [tom@localhost ~]$ sudo chown tom:tom /mnt/ [tom@localhost ~]$ touch /mnt/testfile [tom@localhost ~]$ sudo chattr +i /mnt/testfile [tom@localhost ~]$ echo test > /mnt/testfile bash: /mnt/testfile: Operation not permitted [tom@localhost ~]$ rm /mnt/testfile rm: cannot remove '/mnt/testfile': Operation not permitted [tom@localhost ~]$ sudo umount /mnt/ [tom@localhost ~]$ sudo mount /dev/loop0 /mnt/ [tom@localhost ~]$ lsattr /mnt/testfile ----i-------------- /mnt/testfile [tom@localhost ~]$ echo test > /mnt/testfile [tom@localhost ~]$ rm /mnt/testfile [tom@localhost ~]$ sudo umount /mnt/ Cc: stable@vger.kernel.org Reported-by: Tom Yan Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + fs/f2fs/inode.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 70be377c2236..aee781394c87 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1498,6 +1498,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, false); inode_unlock(inode); out: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 518f49643092..e53c784ab11e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -44,7 +44,6 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -226,6 +225,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; From 71a2058ae62a60a00de63e450d5f3d251cff604d Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Thu, 11 May 2017 04:28:00 +0800 Subject: [PATCH 0308/1212] f2fs: make sure f2fs_gc returns consistent errno By 
default, f2fs_gc returns -EINVAL in general error cases, e.g., no victim was selected. However, the default errno may be overwritten in two cases: gc_more and BG_GC -> FG_GC. We should return consistent errno in such cases. Signed-off-by: Weichao Guo Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b527ab0eec1d..afa2b2cf9f7e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -955,7 +955,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, { int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; - int ret = -EINVAL; + int ret; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -965,8 +965,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); gc_more: - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; @@ -987,6 +989,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, gc_type = FG_GC; } + ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ if (gc_type == BG_GC && !background) goto stop; From 74683b0ea0208d4c37594920e0d1b6499dc5ddc3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 10 May 2017 11:23:36 -0700 Subject: [PATCH 0309/1212] f2fs: use f2fs_submit_page_bio for ra_meta_pages This patch avoids to use f2fs_submit_merged_bio for read, which was the only read case. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d639fd9062d4..81d6de1336d0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -208,12 +208,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } From 4a6ac1475b49371eda81c62150e9a626882f2029 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 10 May 2017 11:28:38 -0700 Subject: [PATCH 0310/1212] f2fs: remove unnecessary read cases in merged IO flow Merged IO flow doesn't need to care about read IOs. f2fs_submit_merged_bio -> f2fs_submit_merged_write f2fs_submit_merged_bios -> f2fs_submit_merged_writes f2fs_submit_merged_bio_cond -> f2fs_submit_merged_write_cond Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 14 +++++----- fs/f2fs/data.c | 55 ++++++++++++++++--------------------- fs/f2fs/f2fs.h | 12 ++++---- fs/f2fs/gc.c | 6 ++-- fs/f2fs/node.c | 11 ++++---- fs/f2fs/segment.c | 11 ++++---- fs/f2fs/super.c | 5 +--- include/trace/events/f2fs.h | 2 +- 8 files changed, 51 insertions(+), 65 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 81d6de1336d0..b7580cf84f94 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -248,13 +248,13 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, - 0, page->index, META, WRITE); + 
f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -357,7 +357,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -905,7 +905,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -1294,7 +1294,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); /* this is the case of multiple fstrims without any changes */ if (cpc->reason & CP_DISCARD) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ae6c1353529f..5afdd9455d43 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -290,14 +290,12 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw) + enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? 
&sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = &sbi->write_io[btype]; down_write(&io->io_rwsem); @@ -317,25 +315,24 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __f2fs_submit_merged_bio(sbi, NULL, 0, 0, type, rw); + __f2fs_submit_merged_write(sbi, NULL, 0, 0, type); } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw) + enum page_type type) { if (has_merged_page(sbi, inode, ino, idx, type)) - __f2fs_submit_merged_bio(sbi, inode, ino, idx, type, rw); + __f2fs_submit_merged_write(sbi, inode, ino, idx, type); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* @@ -367,16 +364,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -int f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = &sbi->write_io[btype]; struct page *bio_page; int err = 0; - io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); @@ -387,8 +383,7 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) /* set submitted = 1 as a return value */ fio->submitted = 1; - if (!is_read) - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); down_write(&io->io_rwsem); @@ -401,12 +396,11 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { err = -EAGAIN; - if (!is_read) - dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - BIO_MAX_PAGES, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } @@ -420,7 +414,7 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); + trace_f2fs_submit_page_write(fio->page, fio); return err; } @@ -1319,7 +1313,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { - f2fs_flush_merged_bios(fio->sbi); + f2fs_flush_merged_writes(fio->sbi); congestion_wait(BLK_RW_ASYNC, HZ/50); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; @@ -1511,8 +1505,7 @@ static int __write_data_page(struct page *page, bool *submitted, ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, - DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; @@ -1523,7 +1516,7 @@ static int __write_data_page(struct page *page, bool *submitted, f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_merged_bio(sbi, DATA, 
WRITE); + f2fs_submit_merged_write(sbi, DATA); submitted = NULL; } @@ -1682,8 +1675,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, mapping->writeback_index = done_index; if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - 0, last_idx, DATA, WRITE); + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b0a093e38104..3fec9d6d3962 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -946,7 +946,6 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ @@ -2392,14 +2391,13 @@ void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw); -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); -int f2fs_submit_page_mbio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index afa2b2cf9f7e..deb20100d0be 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -670,7 +670,7 @@ static void move_encrypted_block(struct inode 
*inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -936,8 +936,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, } if (gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); blk_finish_plug(&plug); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 833f5fb9858c..90715dade918 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1373,15 +1373,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, 0, - page->index, NODE, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); submitted = NULL; } unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); submitted = NULL; } if (submitted) @@ -1518,8 +1518,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, } out: if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(sbi, NULL, ino, last_idx, - NODE, WRITE); + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? 
-EIO: 0; } @@ -1625,7 +1624,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c35e70e72e8b..5331cbefd681 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -328,8 +328,7 @@ static int __commit_inmem_pages(struct inode *inode, } if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(sbi, inode, 0, last_idx, - DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -2229,7 +2228,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - err = f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_write(fio); if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; @@ -2256,7 +2255,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2375,8 +2374,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, - 0, page->index, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb68af6b04c5..528b5198a5e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -817,7 +817,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); 
iput(sbi->node_inode); iput(sbi->meta_inode); @@ -1972,9 +1972,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; for (i = 0; i < NR_PAGE_TYPE; i++) { init_rwsem(&sbi->write_io[i].io_rwsem); sbi->write_io[i].sbi = sbi; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 0d02af995547..fa0d8b07a1bf 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -781,7 +781,7 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, TP_CONDITION(page->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, +DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, TP_PROTO(struct page *page, struct f2fs_io_info *fio), From e61d6504368df2834b9c2b76debe8b1b557d08e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 10 May 2017 14:19:54 -0700 Subject: [PATCH 0311/1212] f2fs: use fio instead of multiple parameters This patch just changes using fio instead of parameters. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5331cbefd681..ca5f815c6eab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2118,61 +2118,62 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; if (is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? 
CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + return __get_segment_type_2(fio); case 4: - return __get_segment_type_4(page, p_type); + return __get_segment_type_4(fio); } + /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + f2fs_bug_on(fio->sbi, fio->sbi->active_logs != NR_CURSEG_TYPE); + + return __get_segment_type_6(fio); } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, @@ -2218,7 +2219,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio->page, fio->type); + int type = __get_segment_type(fio); int err; if (fio->type == NODE || fio->type == DATA) From a3b6a409692bee072eec659b9d18766d53f96c36 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 10 May 2017 11:18:25 -0700 Subject: [PATCH 0312/1212] f2fs: split bio cache Split DATA/NODE type bio cache according to different temperature, so write IOs with the same temperature can be merged in corresponding bio cache as much as possible, otherwise, different temperature write IOs submitting into one bio cache will always cause split of bio. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: include/trace/events/f2fs.h --- fs/f2fs/data.c | 57 +++++++++++++++++++++++++------------ fs/f2fs/f2fs.h | 10 ++++++- fs/f2fs/gc.c | 2 ++ fs/f2fs/segment.c | 24 ++++++++++++---- fs/f2fs/segment.h | 4 +++ fs/f2fs/super.c | 21 ++++++++++++-- include/trace/events/f2fs.h | 11 ++++++- 7 files changed, 100 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5afdd9455d43..f7e597a1d984 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -281,27 +281,32 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, ino, idx); - up_read(&io->io_rwsem); + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. 
*/ + if (ret || btype == META) + break; + } return ret; } static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type) + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, ino, idx)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -311,21 +316,38 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) +{ + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. 
*/ + if (type >= META) + break; + } +} + void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __f2fs_submit_merged_write(sbi, NULL, 0, 0, type); + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, enum page_type type) { - if (has_merged_page(sbi, inode, ino, idx, type)) - __f2fs_submit_merged_write(sbi, inode, ino, idx, type); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@ -368,7 +390,7 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; int err = 0; @@ -404,8 +426,7 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3fec9d6d3962..b0a34ae19a3f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -859,9 +859,17 @@ enum page_type { OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ @@ -946,7 +954,7 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; 
/* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index deb20100d0be..50c7864eb0d9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -586,6 +586,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, @@ -712,6 +713,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ca5f815c6eab..f6cbacf66ddc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2163,17 +2163,29 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) static int __get_segment_type(struct f2fs_io_info *fio) { + int type = 0; + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(fio); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(fio); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(fio->sbi, fio->sbi->active_logs != NR_CURSEG_TYPE); - - return __get_segment_type_6(fio); + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 10bf05d4cff4..e9ba1f1d9723 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -27,6 +27,10 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) +#define 
IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) + #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 528b5198a5e2..1cb9ca9cab33 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -768,6 +768,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); @@ -839,6 +840,8 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->write_io_dummy) mempool_destroy(sbi->write_io_dummy); destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -1973,9 +1976,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->stat_lock); for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 
1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) + goto free_options; + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + } } init_rwsem(&sbi->cp_rwsem); @@ -2221,6 +2234,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); kfree(options); free_sb_buf: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index fa0d8b07a1bf..20c4556ab56d 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -83,6 +83,12 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { REQ_META | REQ_PRIO, "(MP)" }, \ { 0, " \b" }) +#define show_block_temp(temp) \ + __print_symbolic(temp, \ + { HOT, "HOT" }, \ + { WARM, "WARM" }, \ + { COLD, "COLD" }) + #define show_data_type(type) \ __print_symbolic(type, \ { CURSEG_HOT_DATA, "Hot DATA" }, \ @@ -748,6 +754,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(block_t, new_blkaddr) __field(int, op) __field(int, op_flags) + __field(int, temp) __field(int, type) ), @@ -759,16 +766,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; + __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->op, __entry->op_flags), + show_block_temp(__entry->temp), show_block_type(__entry->type)) ); From 5d6951b8e115161e940f46690a2c971833769584 Mon Sep 
17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 May 2017 13:51:34 -0700 Subject: [PATCH 0313/1212] f2fs: avoid f2fs_lock_op for IPU writes Currently, if we do get_node_of_data before f2fs_lock_op, there may be dead lock as follows, where process A would be in infinite loop, and B will NOT be awaked. Process A(cp): Process B: f2fs_lock_all(sbi) get_dnode_of_data <---- lock dn.node_page flush_nodes f2fs_lock_op So, this patch adds f2fs_trylock_op to avoid f2fs_lock_op done by IPU. Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 44 +++++++++++++++++++++++++++++++------------- fs/f2fs/f2fs.h | 13 ++++++++++++- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 2 +- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f7e597a1d984..8211bab93e06 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1381,12 +1381,12 @@ int do_write_data_page(struct f2fs_io_info *fio) if (valid_ipu_blkaddr(fio)) { ipu_force = true; - fio->need_lock = false; + fio->need_lock = LOCK_DONE; goto got_it; } } - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_lock_op(fio->sbi); err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); @@ -1401,19 +1401,18 @@ int do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } got_it: - err = encrypt_one_page(fio); - if (err) - goto out_writepage; - - set_page_writeback(page); - /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); f2fs_put_dnode(&dn); - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = rewrite_data_page(fio); trace_f2fs_do_write_data_page(fio->page, IPU); @@ -1421,6 +1420,20 @@ int do_write_data_page(struct f2fs_io_info *fio) return err; } + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + /* LFS mode write path */ write_data_page(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); @@ -1430,7 +1443,7 @@ int do_write_data_page(struct f2fs_io_info *fio) out_writepage: f2fs_put_dnode(&dn); out: - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); return err; } @@ -1456,7 +1469,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, - .need_lock = true, + .need_lock = LOCK_RETRY, }; trace_f2fs_writepage(page, DATA); @@ -1492,7 +1505,7 @@ static int __write_data_page(struct page *page, bool *submitted, /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - fio.need_lock = false; + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1511,8 +1524,13 @@ static int __write_data_page(struct page *page, bool *submitted, goto out; } - if (err == -EAGAIN) + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == -EAGAIN) { + fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b0a34ae19a3f..4a33399e277c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -866,6 +866,12 @@ enum temp_type 
{ NR_TEMP_TYPE, }; +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ @@ -877,7 +883,7 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ - bool need_lock; /* indicate we need to lock cp_rwsem */ + int need_lock; /* indicate we need to lock cp_rwsem */ }; #define is_read_io(rw) ((rw) == READ) @@ -1346,6 +1352,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 50c7864eb0d9..93ba82c968c6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -719,7 +719,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, - .need_lock = true, + .need_lock = LOCK_REQ, }; bool is_dirty = PageDirty(page); int err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f6cbacf66ddc..1a280099da8f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -312,7 +312,7 @@ static int __commit_inmem_pages(struct inode *inode, fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; - fio.need_lock = false, + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { unlock_page(page); From a34df1bce787535738fa89ab1968482c6ff28f26 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Wed, 17 May 2017 02:48:48 +0000 Subject: [PATCH 0314/1212] f2fs: declare load_free_nid_bitmap static Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c 
b/fs/f2fs/node.c index 90715dade918..8e27e853ed11 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2555,7 +2555,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) return 0; } -inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int i = 0; From 800a48334fcd12b868392f031b6ae7547e469eec Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 17 May 2017 17:22:51 +0800 Subject: [PATCH 0315/1212] f2fs: add a new function get_ssr_cost This patch adds a new method get_ssr_cost to select SSR segment more accurately. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 93ba82c968c6..3c901bc2f917 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -258,11 +258,20 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } +static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + + return se->ckpt_valid_blocks > se->valid_blocks ? + se->ckpt_valid_blocks : se->valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + return get_ssr_cost(sbi, segno); /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) From 842ce444fd86167f3e9dc858f22e4c90639764e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 17 May 2017 10:36:58 -0700 Subject: [PATCH 0316/1212] f2fs: try to freeze in gc and discard threads This allows the gc and discard threads to be frozen.
Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 9 +++++---- fs/f2fs/segment.c | 25 ++++++++++++++++--------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3c901bc2f917..1e6716ee64c1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -32,13 +32,14 @@ static int gc_thread_func(void *data) wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current), + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1a280099da8f..46ee1139046c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -1138,18 +1139,24 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; -repeat: - if (kthread_should_stop()) - return 0; - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + set_freezable(); - congestion_wait(BLK_RW_SYNC, HZ/50); + do { + wait_event_interruptible(*q, kthread_should_stop() || + freezing(current) || + atomic_read(&dcc->discard_cmd_cnt)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; - wait_event_interruptible(*q, kthread_should_stop() || - atomic_read(&dcc->discard_cmd_cnt)); - goto repeat; + __issue_discard_cmd(sbi, true); + __wait_discard_cmd(sbi, true); + + congestion_wait(BLK_RW_SYNC, HZ/50); + } while (!kthread_should_stop()); + return 0; } #ifdef CONFIG_BLK_DEV_ZONED From e2b2bed0961e34d6fd1c293af66bd39b10a317f4 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 19 May 2017 15:06:12 +0800 Subject: [PATCH 0317/1212] f2fs: fix a bug caused by NULL 
extent tree Thread A: Thread B: -f2fs_remount -sbi->mount_opt.opt = 0; <--- -f2fs_iget -do_read_inode -f2fs_init_extent_tree -F2FS_I(inode)->extent_tree is NULL -default_options && parse_options -remount return <--- -f2fs_map_blocks -f2fs_lookup_extent_tree -f2fs_bug_on(sbi, !et); The same problem with f2fs_new_inode. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2f98d7039701..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -320,7 +320,7 @@ static void __drop_largest_extent(struct inode *inode, } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -358,6 +358,16 @@ bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { From b89cdaf6b93047d2f916d8c34100b1239665fd20 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 19 May 2017 14:42:12 +0800 Subject: [PATCH 0318/1212] f2fs: combine huge num of discard rb tree consistence checks Came across a hungtask caused by huge number of rb tree traversing during adding discard addrs in cp. This patch combine these consistence checks and move it to discard thread. 
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 46ee1139046c..3c24a8ca0283 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -917,7 +917,6 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dc->len = blkaddr - dc->lstart; dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); modified = true; } @@ -927,16 +926,12 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr, NULL, NULL); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } else { dc->lstart++; dc->len--; dc->start++; dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } } } @@ -997,8 +992,6 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); di = prev_dc->di; tdc = prev_dc; merged = true; @@ -1014,16 +1007,12 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); merged = true; } if (!merged) { __insert_discard_tree(sbi, bdev, di.lstart, di.start, di.len, NULL, NULL); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } next: prev_dc = next_dc; @@ -1062,6 +1051,8 @@ static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) int i, iter = 0; mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { pend_list = 
&dcc->pend_list[i]; From f152939829d14d6fd6e0f8a461df1996acc8269d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:37:00 +0800 Subject: [PATCH 0319/1212] f2fs: split wio_mutex Split wio_mutex to adjust different temperature bio cache. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 7 ++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4a33399e277c..dc9de0418621 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -961,7 +961,8 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3c24a8ca0283..00503627c1d1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2233,7 +2233,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int err; if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); + mutex_lock(&fio->sbi->wio_mutex[fio->type][fio->temp]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); @@ -2246,7 +2246,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) } if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); + mutex_unlock(&fio->sbi->wio_mutex[fio->type][fio->temp]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1cb9ca9cab33..68d4285f635c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1573,7 +1573,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void 
init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1605,8 +1605,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); } From 843d3364d7996211e38545bcd484d2eebeb1e5a5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:37:01 +0800 Subject: [PATCH 0320/1212] f2fs: introduce io_list for serialize data/node IOs Serialize data/node IOs by using fifo list instead of mutex lock, it will help to enhance concurrency of f2fs, meanwhile keeping LFS IO semantics. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 24 ++++++++++++++++++++---- fs/f2fs/f2fs.h | 7 ++++++- fs/f2fs/gc.c | 3 ++- fs/f2fs/segment.c | 22 +++++++++++++++------- fs/f2fs/super.c | 2 ++ 6 files changed, 46 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b7580cf84f94..69641cf7fd6f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -163,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? 
(REQ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8211bab93e06..f61ab7539229 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -396,6 +396,20 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } + if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); @@ -407,8 +421,6 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) inc_page_count(sbi, WB_DATA_TYPE(bio_page)); - down_write(&io->io_rwsem); - if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || !__same_bdev(sbi, fio->new_blkaddr, io->bio))) @@ -433,9 +445,13 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_write(fio->page, fio); return err; } @@ -748,7 +764,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc9de0418621..f35473293e46 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -882,8 +882,10 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written 
*/ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ }; #define is_read_io(rw) ((rw) == READ) @@ -893,6 +895,8 @@ struct f2fs_bio_info { sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ }; #define FDEV(i) (sbi->devs[i]) @@ -2361,7 +2365,8 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, bool recover_newaddr); void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type); + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1e6716ee64c1..c72da8733ba6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -600,6 +600,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -643,7 +644,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 00503627c1d1..1be5947ae1fe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ 
-2188,7 +2188,8 @@ static int __get_segment_type(struct f2fs_io_info *fio) void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2224,6 +2225,17 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } @@ -2232,11 +2244,9 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int type = __get_segment_type(fio); int err; - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type][fio->temp]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ err = f2fs_submit_page_write(fio); @@ -2244,9 +2254,6 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) fio->old_blkaddr = fio->new_blkaddr; goto reallocate; } - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type][fio->temp]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2260,6 +2267,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68d4285f635c..9c9a01f776dc 100644 --- 
a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1989,6 +1989,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_rwsem(&sbi->write_io[i][j].io_rwsem); sbi->write_io[i][j].sbi = sbi; sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); } } From 732de6bf9e328f15a85606744ddab0559ab7bf65 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:46:43 +0800 Subject: [PATCH 0321/1212] f2fs: show more info if fail to issue discard Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1be5947ae1fe..fcdc45c8ba1b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -741,7 +741,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, if (dc->error) f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", dc->error); + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } From aa9d75d2f522fccac641f5cb535acd53b0238f2e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:46:44 +0800 Subject: [PATCH 0322/1212] f2fs: wake up all waiters in f2fs_submit_discard_endio There could be more than one waiter waiting for discard IO completion, so we need to use complete_all() instead of complete() in f2fs_submit_discard_endio to avoid a hung task.
Fixes: ec9895add2c5 ("f2fs: don't hold cmd_lock during waiting discard command") Cc: Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fcdc45c8ba1b..66cbd3da0404 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -752,7 +752,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) dc->error = bio->bi_error; dc->state = D_DONE; - complete(&dc->wait); + complete_all(&dc->wait); bio_put(bio); } From b3df3669b73c7e48e6b808e1b17522f585bd69ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:46:45 +0800 Subject: [PATCH 0323/1212] f2fs: wait discard IO completion without cmd_lock held Wait discard IO completion outside cmd_lock to avoid long latency of holding cmd_lock in IO busy scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 66cbd3da0404..c8f5d8feac44 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1076,17 +1076,34 @@ static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (!wait_cond || dc->state == D_DONE) { - if (dc->ref) - continue; + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; } } mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + 
mutex_unlock(&dcc->cmd_lock); + goto next; + } } /* This should be covered by global mutex, &sit_i->sentry_lock */ From f4afd85fc4212df5b75ab8e261214b41f0eb8007 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 May 2017 17:39:43 -0700 Subject: [PATCH 0324/1212] f2fs: don't bother checking for encryption key in ->mmap() Since only an open file can be mmap'ed, and we only allow open()ing an encrypted file when its key is available, there is no need to check for the key again before permitting each mmap(). This f2fs copy of this code was also broken in that it wouldn't actually have failed if the key was in fact unavailable. Signed-off-by: Eric Biggers Reviewed-by: David Gstir Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index aee781394c87..8f8bd3e49f1f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -417,14 +417,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!f2fs_encrypted_inode(inode)) - return -ENOKEY; - } - /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) From 6dbf9cbf28ab6e5134ad1a3f2fc1ba4c9e195be0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 May 2017 17:39:45 -0700 Subject: [PATCH 0325/1212] f2fs: don't bother checking for encryption key in ->write_iter() Since only an open file can be written to, and we only allow open()ing an encrypted file when its key is available, there is no need to check for the key again before permitting each ->write_iter(). This code was also broken in that it wouldn't actually have failed if the key was in fact unavailable. 
Signed-off-by: Eric Biggers Reviewed-by: David Gstir Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8f8bd3e49f1f..f87eeb04ea8d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2324,11 +2324,6 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { From 46281c4ef52648cf84e16013bd196a78f9d84d3d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 26 May 2017 17:04:40 +0900 Subject: [PATCH 0326/1212] f2fs: Do not issue small discards in LFS mode clear_prefree_segments() issues small discards after discarding full segments. These small discards may not be section aligned, so not zone aligned on a zoned block device, causing __f2fs_issue_discard_zone() to fail. Fix this by not issuing small discards for a volume mounted with the BLKZONED feature enabled.
Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c8f5d8feac44..33b1628245e7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1416,7 +1416,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (force && len < cpc->trim_minlen) + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, From 97c0717bd043d751969e0d7cc40e12b719bc271f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Jun 2017 15:39:27 -0700 Subject: [PATCH 0327/1212] f2fs: remove false-positive bug_on For example, f2fs_create - new_node_page is failed - handle_failed_inode - skip to add it into orphan list, since ni.blk_addr == NULL_ADDR : set_inode_flag(inode, FI_FREE_NID) f2fs_evict_inode - EIO due to fault injection - f2fs_bug_on() is triggered So, we don't need to call f2fs_bug_on in this case. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e53c784ab11e..868d71436ebc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -425,9 +425,10 @@ void f2fs_evict_inode(struct inode *inode) if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); From 0a3354135a308bb16420547ea91c8fbaabbb8dcd Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Thu, 1 Jun 2017 16:50:10 +0800 Subject: [PATCH 0328/1212] f2fs: remove the unnecessary cast for PTR_ERR It's not necessary to specify 'int' casting for PTR_ERR. Signed-off-by: Zhang Shengju Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a45d1f4b7b0f..ad26f59ba464 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -236,7 +236,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } From 83f42def8ae0598dfa66c3419c9df4078df60a48 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 1 Jun 2017 16:43:51 +0800 Subject: [PATCH 0329/1212] f2fs: fix a panic caused by NULL flush_cmd_control Mount fs with option noflush_merge, boot failed for illegal address fcc in function f2fs_issue_flush: if (!test_opt(sbi, FLUSH_MERGE)) { ret = submit_flush_wait(sbi); atomic_inc(&fcc->issued_flush); -> Here, fcc illegal return ret; } Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff 
--git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 33b1628245e7..3e2121dcaded 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -566,6 +566,9 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -3319,7 +3322,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; From 1479b6ffeb2eba6575600fb941f73f47d74a91a0 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 1 Jun 2017 11:18:30 -0700 Subject: [PATCH 0330/1212] f2fs: sanity check size of nat and sit cache Make sure the number of entries doesn't exceed the max journal size.
Cc: stable@vger.kernel.org Signed-off-by: Jin Qian Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3e2121dcaded..f86fd003f932 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2574,6 +2574,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -2600,6 +2602,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } From 9ddde7dfd1c9446a26224353393d145c60426093 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Fri, 2 Jun 2017 15:45:42 +0800 Subject: [PATCH 0331/1212] f2fs: simplify the way of calculating next nat address The index of the segment that the next nat block is in differs from the current one in only one bit, so to get the next nat address, we can simply flip that one bit.
Signed-off-by: Fan Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 558048e33cf9..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -224,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } From 11de19dfa757695700c28278ff1f0a289c036a9e Mon Sep 17 00:00:00 2001 From: Qiuyang Sun Date: Thu, 18 May 2017 11:06:45 +0800 Subject: [PATCH 0332/1212] f2fs: dax: fix races between page faults and truncating pages Currently in F2FS, page faults and operations that truncate the pagecache or data blocks are completely unsynchronized. This can result in page fault faulting in a page into a range that we are changing after truncating, and thus we can end up with a page mapped to disk blocks that will be shortly freed. Filesystem corruption will shortly follow. This patch fixes the problem by creating a new rw semaphore i_mmap_sem in f2fs_inode_info and grabbing it for functions removing blocks from extent tree and for read over page faults. The mechanism is similar to that in ext4.
Signed-off-by: Qiuyang Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/file.c --- fs/f2fs/data.c | 2 ++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- fs/f2fs/super.c | 1 + 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f61ab7539229..50048986bed9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1799,8 +1799,10 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f35473293e46..da7bb61a678a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -586,6 +586,7 @@ struct f2fs_inode_info { struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; }; static inline void get_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f87eeb04ea8d..447dd1221167 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -34,6 +34,19 @@ #include "trace.h" #include +static int f2fs_filemap_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -61,13 +74,14 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || 
page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -96,6 +110,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -103,7 +119,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -681,8 +697,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return -EACCES; if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); err = f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; } else { @@ -690,7 +708,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. 
*/ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -836,12 +856,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -1080,16 +1102,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1102,6 +1125,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1166,9 +1191,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - return ret; + goto out_sem; truncate_pagecache_range(inode, offset, offset + len - 1); @@ -1182,7 +1208,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, offset + len); } 
else { @@ -1190,7 +1216,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1239,6 +1265,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, out: if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); +out_sem: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1268,14 +1296,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) - return ret; + goto out; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); @@ -1304,6 +1333,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9c9a01f776dc..dc69af8ed028 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -624,6 +624,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; From 4798bcc8085173a5f747ba9a597bc8fc27cbd8d6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Jun 2017 18:29:06 +0800 Subject: [PATCH 0333/1212] f2fs: introduce __wait_one_discard_bio In order to avoid copied codes. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f86fd003f932..e10d4e5f2193 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1074,6 +1074,20 @@ static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) mutex_unlock(&dcc->cmd_lock); } +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1098,13 +1112,7 @@ static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) mutex_unlock(&dcc->cmd_lock); if (need_wait) { - wait_for_completion_io(&dc->wait); - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, dc->state != D_DONE); - dc->ref--; - if (!dc->ref) - __remove_discard_cmd(sbi, dc); - mutex_unlock(&dcc->cmd_lock); + __wait_one_discard_bio(sbi, dc); goto next; } } @@ -1128,15 +1136,8 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { - wait_for_completion_io(&dc->wait); - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, dc->state != D_DONE); - dc->ref--; - if (!dc->ref) - __remove_discard_cmd(sbi, dc); - mutex_unlock(&dcc->cmd_lock); - } + if (need_wait) + __wait_one_discard_bio(sbi, dc); } /* This comes from f2fs_put_super */ From f3fb4448d8f0843b6aec068fdcb0c8ea8a5321da Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Jun 2017 18:29:07 +0800 Subject: [PATCH 0334/1212] f2fs: add f2fs_bug_on in __remove_discard_cmd Recently, discard related codes have changed a lot, 
so add f2fs_bug_on to detect potential bug. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e10d4e5f2193..9b08a6660d13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -739,6 +739,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + f2fs_bug_on(sbi, dc->ref); + if (dc->error == -EOPNOTSUPP) dc->error = 0; From 958022601a8cd3bd95aa2673f9de42ce98c9c4fa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Jun 2017 18:29:08 +0800 Subject: [PATCH 0335/1212] f2fs: don't track newly allocated nat entry in list We will never persist newly allocated nat entries during checkpoint(), so we don't need to track such nat entries in nat dirty list in order to avoid: - more latency during traversing dirty list; - sorting nat sets incorrectly due to recording wrong entry_cnt in nat entry set. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8e27e853ed11..c1b940ea01db 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -158,9 +158,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -171,10 +168,18 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + 
list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -2426,8 +2431,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, From 82aed6f9d75f503b3dfabc7be02a25b168348511 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 7 Jun 2017 11:17:35 +0800 Subject: [PATCH 0336/1212] f2fs: fix to avoid panic when encountering corrupt node With fault_injection option, generic/361 of fstests will complain with the message below: Call Trace: get_node_page+0x12/0x20 [f2fs] f2fs_iget+0x92/0x7d0 [f2fs] f2fs_fill_super+0x10fb/0x15e0 [f2fs] mount_bdev+0x184/0x1c0 f2fs_mount+0x15/0x20 [f2fs] mount_fs+0x39/0x150 vfs_kern_mount+0x67/0x110 do_mount+0x1bb/0xc70 SyS_mount+0x83/0xd0 do_syscall_64+0x6e/0x160 entry_SYSCALL64_slow_path+0x25/0x25 Since mkfs of a loop device in an f2fs partition can fail silently due to checkpoint error injection, the root inode page can be corrupted. In order to avoid a needless panic in get_node_page, it's better to leave a message and return an error to the caller, and let fsck repair it later.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c1b940ea01db..70f3c01a806f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1157,6 +1157,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1170,15 +1171,22 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); ClearPageUptodate(page); + err = -EINVAL; out_err: f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } From e1640316b9d82b2f12ca0165a082e3f357b26d9f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 9 Jun 2017 06:32:54 +0800 Subject: [PATCH 0337/1212] f2fs: use proper variable name It is better to use variable name "inline_dentry" instead of "dentry_blk" when data type is "struct f2fs_inline_dentry". This patch has no functional changes, just to make code more readable especially when call the function make_dentry_ptr_inline() and f2fs_convert_inline_dir(). 
Signed-off-by: Tiezhu Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fc8b49696b9d..03c86e55e4a7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -316,12 +316,12 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; + struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, dentry_blk); + make_dentry_ptr_inline(NULL, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -500,7 +500,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + struct f2fs_inline_dentry *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,11 +510,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + inline_dentry = inline_data_addr(ipage); + bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, slots, NR_INLINE_DENTRY); if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -534,7 +534,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, dentry_blk); + 
make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -586,14 +586,14 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + struct f2fs_inline_dentry *inline_dentry; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + inline_dentry = inline_data_addr(ipage); + bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, NR_INLINE_DENTRY, bit_pos); From fb359654223525f89041c14de65acf1ef081607d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Jun 2017 09:21:11 +0200 Subject: [PATCH 0338/1212] f2fs: Fix a return value in case of error in 'f2fs_fill_super' err must be set to -ENOMEM, otherwise we return 0. Fixes: a912b54d3aaa0 ("f2fs: split bio cache") Signed-off-by: Christophe JAILLET Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dc69af8ed028..fb51fd248d5b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1983,8 +1983,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), GFP_KERNEL); - if (!sbi->write_io[i]) + if (!sbi->write_io[i]) { + err = -ENOMEM; goto free_options; + } for (j = HOT; j < n; j++) { init_rwsem(&sbi->write_io[i][j].io_rwsem); From 47871a830a29b6327cda1e5e2866c39c9f3650f4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jun 2017 09:44:24 +0800 Subject: [PATCH 0339/1212] f2fs: fix to show injection rate in ->show_options If fault injection functionality is enabled, show additional injection rate in ->show_options. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb51fd248d5b..26addfca7baa 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -985,7 +985,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) - seq_puts(seq, ",fault_injection"); + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); #endif return 0; From b9ee759fbcae5bccc24f0520b6e1d5db0b5cc916 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jun 2017 09:44:27 +0800 Subject: [PATCH 0340/1212] f2fs: fix wrong error number of fill_super This patch fixes incorrect error number in error path of fill_super. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 26addfca7baa..d98fdb211cdf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1936,6 +1936,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; goto free_sb_buf; } #endif @@ -2009,8 +2010,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); - if (!sbi->write_io_dummy) + if (!sbi->write_io_dummy) { + err = -ENOMEM; goto free_options; + } } /* get an inode for meta space */ From 1fb6bf71cd86f0f8433ff513f795f8248d18f59d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 17:39:46 +0800 Subject: [PATCH 0341/1212] f2fs: clean up sysfs codes Just cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 121 +++++++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 47 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d98fdb211cdf..70e82c4c210d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -349,6 +349,22 @@ static struct kobj_type f2fs_ktype = { .release = f2fs_sb_release, }; +int __init f2fs_register_sysfs(void) +{ + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) + return -ENOMEM; + return 0; +} + +void f2fs_unregister_sysfs(void) +{ + kset_unregister(f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); +} + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -766,17 +782,23 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } -static void f2fs_put_super(struct super_block *sb) +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); - int i; + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } - kobject_del(&sbi->s_kobj); +} + +static void f2fs_put_super(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; stop_gc_thread(sbi); @@ -829,8 +851,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_exit_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -1060,6 +1082,37 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); +int 
f2fs_init_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } + + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto err_out; + return 0; +err_out: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + return err; +} + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -2120,22 +2173,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + err = f2fs_init_sysfs(sbi); if (err) - goto free_proc; + goto free_root_inode; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2146,7 +2186,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_sysfs; } if (need_fsck) @@ -2160,7 +2200,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync 
data errno=%d", err); - goto free_kobj; + goto free_sysfs; } } else { err = recover_fsync_data(sbi, true); @@ -2169,7 +2209,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -2184,7 +2224,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_sysfs; } kfree(options); @@ -2202,17 +2242,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_sysfs: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } + f2fs_exit_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2327,30 +2359,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_register_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_unregister_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2367,11 +2395,10 @@ static 
int __init init_f2fs_fs(void) static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_unregister_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); From 06fc88a60d56abbde2f0cd034358fef37f7d307c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 17:39:47 +0800 Subject: [PATCH 0342/1212] f2fs: move sysfs code from super.c to fs/f2fs/sysfs.c Code related to sysfs and procfs is scattered and mixed with superblock-related code, but it is actually independent of the rest, so split it out of super.c, and reorganize and manage it in sysfs.c. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/super.c --- fs/f2fs/Makefile | 2 +- fs/f2fs/f2fs.h | 8 ++ fs/f2fs/super.c | 333 -------------------------------------------- fs/f2fs/sysfs.c | 350 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 359 insertions(+), 334 deletions(-) create mode 100644 fs/f2fs/sysfs.c diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index da7bb61a678a..0d9d25891833 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2740,6 +2740,14 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); +/* + * sysfs.c + */ +int __init f2fs_register_sysfs(void); +void 
f2fs_unregister_sysfs(void); +int f2fs_init_sysfs(struct f2fs_sb_info *sbi); +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); + /* * crypto support */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 70e82c4c210d..42093c7c9ae0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -35,9 +35,7 @@ #define CREATE_TRACE_POINTS #include -static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -146,225 +144,6 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - DCC_INFO, /* struct discard_cmd_control */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == DCC_INFO) - return (unsigned char *)SM_I(sbi)->dcc_info; - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->sb; - - if 
(!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); -} - -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) - return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->store ? 
a->store(a, sbi, buf, len) : 0; -} - -static void f2fs_sb_release(struct kobject *kobj) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ -} - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, 
cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(min_hot_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), -#endif - ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - -int __init f2fs_register_sysfs(void) -{ - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); - - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) - return -ENOMEM; - return 0; -} - -void f2fs_unregister_sysfs(void) -{ - kset_unregister(f2fs_kset); - remove_proc_entry("fs/f2fs", NULL); -} - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
{ struct va_format vaf; @@ -782,19 +561,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) -{ - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); - } -} - static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1014,105 +780,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, false)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, false)); - for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", 
se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .owner = THIS_MODULE, \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - -int f2fs_init_sysfs(struct f2fs_sb_info *sbi) -{ - struct super_block *sb = sbi->sb; - int err; - - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); - if (err) - goto err_out; - return 0; -err_out: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - return err; -} - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..714a3e47bbe8 --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,350 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; +static struct kset *f2fs_kset; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned 
int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + *ui = t; + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, 
cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + 
seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + +int __init f2fs_register_sysfs(void) +{ + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) + return -ENOMEM; + return 0; +} + +void f2fs_unregister_sysfs(void) +{ + kset_unregister(f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); +} + +int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } + + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = 
kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto err_out; + return 0; +err_out: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + return err; +} + +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +{ + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } +} From 4e968ec7cb9d76cc71c46370696784818a0dd364 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 13 Jun 2017 16:47:54 -0700 Subject: [PATCH 0343/1212] f2fs: require key for truncate(2) of encrypted file Currently, filesystems allow truncate(2) on an encrypted file without the encryption key. However, it's impossible to correctly handle the case where the size being truncated to is not a multiple of the filesystem block size, because that would require decrypting the final block, zeroing the part beyond i_size, then encrypting the block. As other modifications to encrypted file contents are prohibited without the key, just prohibit truncate(2) as well, making it fail with ENOKEY. 
Signed-off-by: Eric Biggers Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 447dd1221167..e93dcb9da1c0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -692,9 +692,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } if (attr->ia_size <= i_size_read(inode)) { down_write(&F2FS_I(inode)->i_mmap_sem); From 8a4c67330110ee0623b7215961ced82dc5e6b5cc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 23:00:55 +0800 Subject: [PATCH 0344/1212] f2fs: set CP_TRIMMED_FLAG correctly Don't set CP_TRIMMED_FLAG for non-zoned block device or discard unsupported device, which avoids triggering an unneeded checkpoint for that kind of device. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 42093c7c9ae0..2b72e1e9a330 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -587,7 +587,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ f2fs_wait_discard_bios(sbi); - if (!sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; From b3cba4ddf8140b28c9bcc4c22c1ccbe1342cc55a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 23:00:56 +0800 Subject: [PATCH 0345/1212] f2fs: measure inode.i_blocks as generic filesystem Both in memory or on disk, generic filesystems record i_blocks with 512bytes sized sector count, also VFS sub module such as disk quota follows this rule, but f2fs records it with 4096bytes sized block count, this difference leads to that once we use dquota's function which inc/dec iblocks, it will make i_blocks of f2fs being inconsistent between in memory and on disk. In order to resolve this issue, this patch changes to make in-memory i_blocks of f2fs recording sector count instead of block count, meanwhile leaving on-disk i_blocks recording block count. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 23 +++++++++++++---------- fs/f2fs/file.c | 1 - fs/f2fs/inode.c | 5 +++-- fs/f2fs/node.c | 2 +- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d9d25891833..8e0f9693db04 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1419,10 +1419,10 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > + (F2FS_DEFAULT_ALLOCATED_BLOCKS + xattr_block); } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1430,7 +1430,7 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { @@ -1468,11 +1468,13 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); f2fs_i_blocks_write(inode, count, false); @@ -1923,13 +1925,14 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - 
blkcnt_t diff, bool add) + block_t diff, bool add) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + blkcnt_t sectors = diff << F2FS_LOG_SECTORS_PER_BLOCK; - inode->i_blocks = add ? inode->i_blocks + diff : - inode->i_blocks - diff; + inode->i_blocks = add ? inode->i_blocks + sectors : + inode->i_blocks - sectors; f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e93dcb9da1c0..b9a33c910b8a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -647,7 +647,6 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 868d71436ebc..1ff5bd418d87 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include @@ -129,7 +130,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks)); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -267,7 +268,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks)); if (et) { read_lock(&et->lock); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 70f3c01a806f..b36b34f45bae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1011,7 +1011,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has 
failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ truncate_node(&dn); From 17ca8933b3827480fe7e94c228db05d3a471d180 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Jun 2017 08:05:32 -0700 Subject: [PATCH 0346/1212] f2fs: don't need to check encrypted inode for partial truncation The cache_only is always false, if inode is encrypted. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b9a33c910b8a..cb99a7701080 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -537,8 +537,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; From 40edf0c8ad01ca3c495bf2173ae5e263e9dbd318 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 15 Jun 2017 16:44:42 -0700 Subject: [PATCH 0347/1212] f2fs: add ioctl to do gc with target block address This patch adds f2fs_ioc_gc_range() to move blocks located in the given range. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8e0f9693db04..c0985a462af9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -370,6 +370,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_move_range) #define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -394,6 +396,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cb99a7701080..9978e86c89c5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1896,6 +1896,50 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; } +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, 
range.start)); + range.start += sbi->blocks_per_seg; + if (range.start <= end) + goto do_more; +out: + mnt_drop_write_file(filp); + return ret; +} + static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2340,6 +2384,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: @@ -2413,6 +2459,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: From 38435dd20c4cd9e08dcdb9a8f79529cf13bf4936 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Jun 2017 20:55:55 -0700 Subject: [PATCH 0348/1212] f2fs: report # of free inodes more precisely If the partition is small, we don't need to report total # of inodes including hidden free nodes. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2b72e1e9a330..8948d7b2cb28 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -681,6 +681,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -693,9 +694,16 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = min(buf->f_files - valid_node_count(sbi), - buf->f_bavail); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From 99ad6f555ab7f7ca7c337521dfe0c66a703dbdff Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Sat, 24 Jun 2017 15:57:19 +0800 Subject: [PATCH 0349/1212] f2fs: avoid redundant f2fs_flush after remount create_flush_cmd_control will create redundant issue_flush_thread after each remount with flush_merge option. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9b08a6660d13..d73b4b29055c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -555,6 +555,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; goto init_thread; } From ea9b9467fabf37b393eca00668cf9e2f8fce1ee8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 26 Jun 2017 16:24:41 +0800 Subject: [PATCH 0350/1212] f2fs: introduce reserved_blocks in sysfs In this patch, we add a new sysfs interface, with it, we can control number of reserved blocks in system which could not be used by user, it enable f2fs to let user to configure for adjusting over-provision ratio dynamically instead of changing it by mkfs. So we can expect it will help to reserve more free space for relieving GC in both filesystem and flash device. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: Documentation/ABI/testing/sysfs-fs-f2fs --- Documentation/ABI/testing/sysfs-fs-f2fs | 32 +++++++++++++++++++++++++ fs/f2fs/f2fs.h | 13 ++++++---- fs/f2fs/super.c | 4 +++- fs/f2fs/sysfs.c | 16 ++++++++++++- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c727..2805ce062fdb 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -92,3 +92,35 @@ Date: October 2015 Contact: "Chao Yu" Description: Controls the count of nid pages to be readaheaded. + +What: /sys/fs/f2fs//dirty_nats_ratio +Date: January 2016 +Contact: "Chao Yu" +Description: + Controls dirty nat entries ratio threshold, if current + ratio exceeds configured threshold, checkpoint will + be triggered for flushing dirty nat entries. 
+ +What: /sys/fs/f2fs//lifetime_write_kbytes +Date: January 2016 +Contact: "Shuoran Liu" +Description: + Shows total written kbytes issued to disk. + +What: /sys/fs/f2fs//inject_rate +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection rate. + +What: /sys/fs/f2fs//inject_type +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection type. + +What: /sys/fs/f2fs//reserved_blocks +Date: June 2017 +Contact: "Chao Yu" +Description: + Controls current reserved blocks in system. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c0985a462af9..e908bf456230 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1033,6 +1033,8 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ @@ -1443,6 +1445,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { blkcnt_t diff; + block_t avail_user_block_count; #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { @@ -1458,10 +1461,11 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); @@ 
-1623,7 +1627,8 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8948d7b2cb28..c83a5ad7c2ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -692,7 +692,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -1773,6 +1774,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 714a3e47bbe8..9adc202fcd6f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -30,6 +30,7 @@ enum { FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif + RESERVED_BLOCKS, }; struct f2fs_attr { @@ -51,7 +52,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)SM_I(sbi)->dcc_info; else if (struct_type == NM_INFO) return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) return (unsigned char *)sbi; #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || @@ -111,6 +112,17 @@ static ssize_t 
f2fs_sbi_store(struct f2fs_attr *a, if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; #endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } *ui = t; return count; } @@ -165,6 +177,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -208,6 +221,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_blocks), NULL, }; From 243d3acf5b181bbea3a4ad2ec8a3a84c26a22701 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 29 Jun 2017 23:17:45 +0800 Subject: [PATCH 0351/1212] f2fs: stop gc/discard thread in prior during umount This patch resolves kernel panic for xfstests/081, caused by recent f2fs_bug_on f2fs: add f2fs_bug_on in __remove_discard_cmd For fixing, we will stop gc/discard thread in prior in ->kill_sb in order to avoid referring and releasing race among them. 
Signed-off-by: Jaegeuk Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 19 +++++++++++++------ fs/f2fs/super.c | 7 ++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e908bf456230..33681a2f160d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2360,6 +2360,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d73b4b29055c..09532f823cbc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1144,6 +1144,18 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) __wait_one_discard_bio(sbi, dc); } +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + /* This comes from f2fs_put_super */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { @@ -1501,12 +1513,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return; - if (dcc->f2fs_issue_discard) { - struct task_struct *discard_thread = dcc->f2fs_issue_discard; - - dcc->f2fs_issue_discard = NULL; - kthread_stop(discard_thread); - } + stop_discard_thread(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c83a5ad7c2ba..e4202585f92e 100644 --- a/fs/f2fs/super.c 
+++ b/fs/f2fs/super.c @@ -566,8 +566,6 @@ static void f2fs_put_super(struct super_block *sb) struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; - stop_gc_thread(sbi); - /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -1981,8 +1979,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } From 3099c953ccfdd643e83329f55afa088e94904831 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 30 Jun 2017 17:19:02 +0800 Subject: [PATCH 0352/1212] f2fs: introduce __check_sit_bitmap After we introduce discard thread, discard command can be issued concurrently with data allocating, this patch adds new function to check sit bitmap to ensure that userdata was invalid in which on-going discard command covered. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 09532f823cbc..0f6cded83c7b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -844,6 +844,31 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + size = min((unsigned long)(end - blk), max_blocks); + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk 
+= size; + } +#endif +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { @@ -869,6 +894,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); } } else { __remove_discard_cmd(sbi, dc); From 0b1e7ba9ee12277e92fc9043fd8a9f718e8bd9c1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 29 Jun 2017 23:20:45 +0800 Subject: [PATCH 0353/1212] f2fs: skip ->writepages for {mete,node}_inode during recovery Skip ->writepages in prior to ->writepage for {meta,node}_inode during recovery, hence unneeded loop in ->writepages can be avoided. Moreover, check SBI_POR_DOING earlier while writebacking pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/data.c | 13 +++++++------ fs/f2fs/node.c | 3 +++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 69641cf7fd6f..52ed60e72a25 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -270,6 +270,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 50048986bed9..4a1730cfe86c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1490,6 +1490,9 @@ static int __write_data_page(struct page *page, bool *submitted, trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1503,8 +1506,6 @@ 
static int __write_data_page(struct page *page, bool *submitted, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1752,6 +1753,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1761,10 +1766,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b36b34f45bae..f5eebb99f57a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1690,6 +1690,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); From 76b23d9006666425dce0ff700370c4887081129d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 26 Jun 2017 10:41:35 +0800 Subject: [PATCH 0354/1212] f2fs: do not set LOST_PINO for newly created dir Since directories will be written back with checkpoint and fsync a directory will always write CP, there is no need to set LOST_PINO after creating a directory. 
Signed-off-by: Sheng Yong Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a87a5ecca74d..1380c442648b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -415,7 +415,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. From deb4cef3cba9854c07641dd29b63f16dc535889d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 26 Jun 2017 10:41:36 +0800 Subject: [PATCH 0355/1212] f2fs: do not set LOST_PINO for renamed dir After renaming a directory, fsck could detect unmatched pino. The scenario can be reproduced as the following: $ mkdir /bar/subbar /foo $ rename /bar/subbar /foo Then fsck will report: [ASSERT] (__chk_dots_dentries:1182) --> Bad inode number[0x3] for '..', parent parent ino is [0x4] Rename sets LOST_PINO for old_inode. However, the flag cannot be cleared, since dir is written back with CP. So, let's get rid of LOST_PINO for a renamed dir and fix the pino directly at the end of rename. 
Signed-off-by: Sheng Yong Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 77349d51f952..82714cdde5f2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -769,7 +769,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); From 0b359879b966344cb2aa51e3906b422dfd201974 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Jul 2017 12:17:24 +0800 Subject: [PATCH 0356/1212] Revert "f2fs: fix to clean previous mount option when remount_fs" Don't clear old mount option before parse new option during ->remount_fs like other generic filesystems. This reverts commit 26666c8a4366debae30ae37d0688b2bec92d196a. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e4202585f92e..7f3c99d43579 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -847,7 +847,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ From 4f7f22b7c85a4b25247df14955c64761b401ebc5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Jul 2017 01:11:31 +0800 Subject: [PATCH 0357/1212] f2fs: don't count inode block in in-memory inode.i_blocks Previously, we count all inode consumed blocks including inode block, xattr block, index block, data block into i_blocks, for other generic filesystems, they won't count inode block into i_blocks, so for userspace applications or quota system, they may detect incorrect block count according to i_blocks value in inode. 
This patch changes to count all blocks into inode.i_blocks excluding inode block, for on-disk i_blocks, we keep counting inode block for backward compatibility. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 22 ++++++++++++---------- fs/f2fs/inode.c | 4 ++-- fs/f2fs/node.c | 16 ++++++---------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 33681a2f160d..3e0832973a2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1422,8 +1422,6 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ @@ -1431,8 +1429,7 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; - return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > - (F2FS_DEFAULT_ALLOCATED_BLOCKS + xattr_block); + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1619,7 +1616,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; @@ -1639,8 +1636,12 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, return false; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true); + } sbi->total_valid_node_count++; sbi->total_valid_block_count++; @@ -1651,15 +1652,16 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); 
f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1ff5bd418d87..e42a7a8805dc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -130,7 +130,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks)); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -268,7 +268,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks)); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); if (et) { read_lock(&et->lock); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f5eebb99f57a..81c8d4eca6b9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -678,15 +678,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -694,7 +690,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); 
f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1044,7 +1040,7 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { + if (unlikely(!inc_valid_node_count(sbi, dn->inode, !ofs))) { err = -ENOSPC; goto fail; } @@ -2210,14 +2206,14 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: update xattr nid in inode */ remove_free_nid(sbi, new_xnid); f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(!inc_valid_node_count(sbi, inode))) + if (unlikely(!inc_valid_node_count(sbi, inode, false))) f2fs_bug_on(sbi, 1); update_inode_page(inode); @@ -2275,7 +2271,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(!inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); From e4a2b45da36b6e2c31bdee76550f0eccd1f9a21f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Jul 2017 14:46:01 -0700 Subject: [PATCH 0358/1212] f2fs: relax migratepage for atomic written page In order to avoid lock contention for atomic written pages, we'd better give EBUSY in f2fs_migrate_page when mode is asynchronous. We expect it will be released soon as transaction commits. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4a1730cfe86c..dc204f178b13 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2202,8 +2202,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping, From a2bd44aca5f999b5fb69428448f88e4e8823c9ee Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Jul 2017 14:10:15 +0800 Subject: [PATCH 0359/1212] f2fs: use spin_{,un}lock_irq{save,restore} generic/361 reports the warning below. This happens because, once someone has entered the critical region of sbi.cp_lock, if f2fs_write_end_io -> f2fs_stop_checkpoint is invoked from a triggered IRQ, we will encounter a deadlock. So this patch changes to use spin_{,un}lock_irq{save,restore} to create the critical region with IRQs disabled, avoiding the potential deadlock. irq event stamp: 83391573 loop: Write error at byte offset 438729728, length 1024. hardirqs last enabled at (83391573): [] restore_all+0xf/0x65 hardirqs last disabled at (83391572): [] reschedule_interrupt+0x30/0x3c loop: Write error at byte offset 438860288, length 1536. softirqs last enabled at (83389244): [] __do_softirq+0x1ae/0x476 softirqs last disabled at (83389237): [] do_softirq_own_stack+0x2c/0x40 loop: Write error at byte offset 438990848, length 2048. ================================ WARNING: inconsistent lock state 4.12.0-rc2+ #30 Tainted: G O -------------------------------- inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
xfs_io/7959 [HC1[1]:SC0[0]:HE0:SE1] takes: (&(&sbi->cp_lock)->rlock){?.+...}, at: [] f2fs_stop_checkpoint+0x1c/0x50 [f2fs] {HARDIRQ-ON-W} state was registered at: __lock_acquire+0x527/0x7b0 lock_acquire+0xae/0x220 _raw_spin_lock+0x42/0x50 do_checkpoint+0x165/0x9e0 [f2fs] write_checkpoint+0x33f/0x740 [f2fs] __f2fs_sync_fs+0x92/0x1f0 [f2fs] f2fs_sync_fs+0x12/0x20 [f2fs] sync_filesystem+0x67/0x80 generic_shutdown_super+0x27/0x100 kill_block_super+0x22/0x50 kill_f2fs_super+0x3a/0x40 [f2fs] deactivate_locked_super+0x3d/0x70 deactivate_super+0x40/0x60 cleanup_mnt+0x39/0x70 __cleanup_mnt+0x10/0x20 task_work_run+0x69/0x80 exit_to_usermode_loop+0x57/0x85 do_fast_syscall_32+0x18c/0x1b0 entry_SYSENTER_32+0x4c/0x7b irq event stamp: 1957420 hardirqs last enabled at (1957419): [] _raw_spin_unlock_irq+0x27/0x50 hardirqs last disabled at (1957420): [] call_function_single_interrupt+0x30/0x3c softirqs last enabled at (1953784): [] __do_softirq+0x1ae/0x476 softirqs last disabled at (1953773): [] do_softirq_own_stack+0x2c/0x40 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&sbi->cp_lock)->rlock); lock(&(&sbi->cp_lock)->rlock); *** DEADLOCK *** 2 locks held by xfs_io/7959: #0: (sb_writers#13){.+.+.+}, at: [] vfs_write+0x16a/0x190 #1: (&sb->s_type->i_mutex_key#16){+.+.+.}, at: [] f2fs_file_write_iter+0x25/0x140 [f2fs] stack backtrace: CPU: 2 PID: 7959 Comm: xfs_io Tainted: G O 4.12.0-rc2+ #30 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack+0x5f/0x92 print_usage_bug+0x1d3/0x1dd ? check_usage_backwards+0xe0/0xe0 mark_lock+0x23d/0x280 __lock_acquire+0x699/0x7b0 ? __this_cpu_preempt_check+0xf/0x20 ? trace_hardirqs_off_caller+0x91/0xe0 lock_acquire+0xae/0x220 ? f2fs_stop_checkpoint+0x1c/0x50 [f2fs] _raw_spin_lock+0x42/0x50 ? 
f2fs_stop_checkpoint+0x1c/0x50 [f2fs] f2fs_stop_checkpoint+0x1c/0x50 [f2fs] f2fs_write_end_io+0x147/0x150 [f2fs] bio_endio+0x7a/0x1e0 blk_update_request+0xad/0x410 blk_mq_end_request+0x16/0x60 lo_complete_rq+0x3c/0x70 __blk_mq_complete_request_remote+0x11/0x20 flush_smp_call_function_queue+0x6d/0x120 ? debug_smp_processor_id+0x12/0x20 generic_smp_call_function_single_interrupt+0x12/0x30 smp_call_function_single_interrupt+0x25/0x40 call_function_single_interrupt+0x37/0x3c EIP: _raw_spin_unlock_irq+0x2d/0x50 EFLAGS: 00000296 CPU: 2 EAX: 00000001 EBX: d2ccc51c ECX: 00000001 EDX: c1aacebd ESI: 00000000 EDI: 00000000 EBP: c96c9d1c ESP: c96c9d18 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 ? inherit_task_group.isra.98.part.99+0x6b/0xb0 __add_to_page_cache_locked+0x1d4/0x290 add_to_page_cache_lru+0x38/0xb0 pagecache_get_page+0x8e/0x200 f2fs_write_begin+0x96/0xf00 [f2fs] ? trace_hardirqs_on_caller+0xdd/0x1c0 ? current_time+0x17/0x50 ? trace_hardirqs_on+0xb/0x10 generic_perform_write+0xa9/0x170 __generic_file_write_iter+0x1a2/0x1f0 ? f2fs_preallocate_blocks+0x137/0x160 [f2fs] f2fs_file_write_iter+0x6e/0x140 [f2fs] ? 
__lock_acquire+0x429/0x7b0 __vfs_write+0xc1/0x140 vfs_write+0x9b/0x190 SyS_pwrite64+0x63/0xa0 do_fast_syscall_32+0xa1/0x1b0 entry_SYSENTER_32+0x4c/0x7b EIP: 0xb7786c61 EFLAGS: 00000293 CPU: 2 EAX: ffffffda EBX: 00000003 ECX: 08416000 EDX: 00001000 ESI: 18b24000 EDI: 00000000 EBP: 00000003 ESP: bf9b36b0 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b Fixes: aaec2b1d1879 ("f2fs: introduce cp_lock to protect updating of ckpt_flags") Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++++++----- fs/f2fs/f2fs.h | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 52ed60e72a25..aabf7c4984d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1054,8 +1054,9 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if ((cpc->reason & CP_UMOUNT) && le32_to_cpu(ckpt->cp_pack_total_block_count) > @@ -1086,14 +1087,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1135,12 +1136,12 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = 
npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3e0832973a2c..b44519fef652 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1321,9 +1321,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1337,22 +1339,26 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { + unsigned long flags; + set_sbi_flag(sbi, SBI_NEED_FSCK); if (lock) - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); kfree(NM_I(sbi)->nat_bits); NM_I(sbi)->nat_bits = NULL; if (lock) - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, From f2cade6d8a7ec93e8e7392c4c161d26ea3350aad Mon Sep 17 00:00:00 2001 From: Jaegeuk 
Kim Date: Wed, 21 Jun 2017 17:52:39 -0700 Subject: [PATCH 0360/1212] f2fs: avoid deadlock caused by lock order of page and lock_op - punch_hole - fill_zero - f2fs_lock_op - get_new_data_page - lock_page - f2fs_write_data_pages - lock_page - do_write_data_page - f2fs_lock_op Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dc204f178b13..e5efea00ad05 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1402,8 +1402,9 @@ int do_write_data_page(struct f2fs_io_info *fio) } } - if (fio->need_lock == LOCK_REQ) - f2fs_lock_op(fio->sbi); + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) @@ -1665,7 +1666,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1701,6 +1702,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping, unlock_page(page); ret = 0; continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; } done_index = page->index + 1; done = 1; From 9df0a9280fcbdabf170ad3c0d36548bf9fc37e67 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 9 Jul 2017 00:13:07 +0800 Subject: [PATCH 0361/1212] f2fs: support plain user/group quota This patch adds to support plain user/group quota. Change Note by Jaegeuk Kim. - Use f2fs page cache for quota files in order to consider garbage collection. so, quota files are not tolerable for sudden power-cuts, so user needs to do quotacheck. - setattr() calls dquot_transfer which will transfer inode->i_blocks. We can't reclaim that during f2fs_evict_inode(). 
So, we need to count node blocks as well in order to match i_blocks with dquot's space. Note that, Chao wrote a patch to count inode->i_blocks without inode block. (f2fs: don't count inode block in in-memory inode.i_blocks) - in f2fs_remount, we need to make RW in prior to dquot_resume. - handle fault_injection case during f2fs_quota_off_umount - TODO: Project quota Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 + fs/f2fs/data.c | 10 +- fs/f2fs/f2fs.h | 92 +++++++--- fs/f2fs/file.c | 34 +++- fs/f2fs/inode.c | 5 + fs/f2fs/namei.c | 66 ++++++- fs/f2fs/node.c | 9 +- fs/f2fs/super.c | 278 +++++++++++++++++++++++++++++ 8 files changed, 454 insertions(+), 42 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 8e454b0559f1..3ba27469a8dd 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -160,6 +160,8 @@ mode=%s Control block allocation mode which supports "adaptive" writes towards main area. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting. 
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e5efea00ad05..b8588c8360e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -490,14 +490,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -748,6 +749,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; @@ -756,8 +758,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b44519fef652..c1d323018738 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_FS_ENCRYPTION #include #else @@ -89,6 +90,8 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -588,6 +591,12 @@ struct 
f2fs_inode_info { nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ @@ -1443,17 +1452,23 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, block_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); - return false; + release = *count; + goto enospc; } #endif /* @@ -1468,17 +1483,24 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; + release = diff; sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, release); + 
return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, @@ -1492,7 +1514,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1621,11 +1643,18 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); @@ -1633,28 +1662,33 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, if (unlikely(valid_block_count + sbi->reserved_blocks > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; - } - - if (inode) { - if (is_inode) - f2fs_mark_inode_dirty_sync(inode, true); - else - f2fs_i_blocks_write(inode, 1, true); + goto enospc; } sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, @@ -1666,12 
+1700,13 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - if (!is_inode) - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1946,14 +1981,21 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - block_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - blkcnt_t sectors = diff << F2FS_LOG_SECTORS_PER_BLOCK; - inode->i_blocks = add ? inode->i_blocks + sectors : - inode->i_blocks - sectors; + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9978e86c89c5..5700722a1c42 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -445,11 +445,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - int ret = generic_file_open(inode, filp); struct dentry *dir; - if (!ret && f2fs_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); + if (f2fs_encrypted_inode(inode)) { + int ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; if (!fscrypt_has_encryption_key(inode)) @@ -462,7 +461,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return -EPERM; } dput(dir); - return ret; + return 
dquot_file_open(inode, filp); } int truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -692,6 +691,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + if (attr->ia_valid & ATTR_SIZE) { if (f2fs_encrypted_inode(inode)) { err = fscrypt_get_encryption_info(inode); @@ -981,9 +994,9 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1508,6 +1521,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode_lock(inode); + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + inode_unlock(inode); + ret = -EPERM; + goto unlock_out; + } + flags = f2fs_mask_flags(inode->i_mode, flags); oldflags = fi->i_flags; @@ -1527,7 +1547,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); f2fs_mark_inode_dirty_sync(inode, false); - +unlock_out: inode_unlock(inode); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e42a7a8805dc..6cd312a17c69 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -373,6 +373,8 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; + dquot_initialize(inode); + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -405,8 +407,11 @@ void f2fs_evict_inode(struct inode *inode) if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 82714cdde5f2..766439e3ba42 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,10 +55,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory encrypted, then we should encrypt the inode. 
*/ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); @@ -85,6 +95,16 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +156,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -180,6 +204,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -347,6 +375,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -410,6 +442,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -497,6 +533,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -545,6 +585,10 @@ static int f2fs_mknod(struct inode *dir, 
struct dentry *dentry, struct inode *inode; int err = 0; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -580,6 +624,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -673,6 +721,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -853,6 +909,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 81c8d4eca6b9..d737ae43ce08 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1040,10 +1040,9 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode, !ofs))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } + #ifdef CONFIG_F2FS_CHECK_FS get_node_info(sbi, dn->nid, &new_ni); f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); @@ -2213,7 +2212,7 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) /* 2: update xattr nid in inode */ remove_free_nid(sbi, new_xnid); f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(!inc_valid_node_count(sbi, inode, false))) + if 
(unlikely(inc_valid_node_count(sbi, inode, false))) f2fs_bug_on(sbi, 1); update_inode_page(inode); @@ -2271,7 +2270,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL, true))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7f3c99d43579..49dd2b8efc03 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -106,6 +107,8 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_usrquota, + Opt_grpquota, Opt_err, }; @@ -141,6 +144,8 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, {Opt_err, NULL}, }; @@ -380,6 +385,20 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; +#else + case Opt_usrquota: + case Opt_grpquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -421,6 +440,10 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->dio_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; return &fi->vfs_inode; @@ -561,11 +584,14 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } +static void f2fs_quota_off_umount(struct super_block *sb); static 
void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + f2fs_quota_off_umount(sb); + /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -783,6 +809,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fault_injection=%u", sbi->fault_info.inject_rate); #endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif return 0; } @@ -823,6 +855,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -836,6 +869,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * need to restore them. */ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; /* recover superblocks we couldn't write due to previous RO mount */ @@ -861,6 +895,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -925,12 +969,235 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + 
size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return err; + 
inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, -1); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, -1); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + 
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +static void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + f2fs_quota_off(sb, type); +} + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#else +static inline void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, @@ -938,6 +1205,11 @@ static struct super_operations f2fs_sops = { .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1684,6 +1956,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; From 
972aaba68e97f8eabdfcfe594a9b78d604ee613f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 10 Jul 2017 19:16:28 -0700 Subject: [PATCH 0362/1212] f2fs: make more close to v4.13-rc1 Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/file.c | 4 ++-- fs/f2fs/namei.c | 4 ++-- fs/f2fs/node.c | 6 +++--- fs/f2fs/segment.c | 18 +++++++++--------- fs/f2fs/super.c | 8 ++++---- 6 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c1d323018738..ecfd7fc02b57 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2228,7 +2228,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +static inline void *kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -2238,7 +2238,7 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) return ret; } -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +static inline void *kvzalloc(size_t size, gfp_t flags) { void *ret; @@ -2891,4 +2891,5 @@ static inline bool f2fs_may_encrypt(struct inode *inode) return 0; #endif } + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5700722a1c42..789d75beb7ed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1048,11 +1048,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 766439e3ba42..541d755193c8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -988,7 +988,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, 
file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = CURRENT_TIME; + old_dir->i_ctime = current_time(old_dir); if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -1003,7 +1003,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = CURRENT_TIME; + new_dir->i_ctime = current_time(new_dir); if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d737ae43ce08..d0d6a5830181 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2664,17 +2664,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0f6cded83c7b..9744e8c9d308 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1243,8 +1243,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? 
FDEV(devi).path: "", @@ -2998,13 +2998,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -3037,7 +3037,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -3088,12 +3088,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -3273,7 +3273,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -3295,7 +3295,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + 
dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 49dd2b8efc03..1eb2013fece6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1638,16 +1638,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); @@ -1789,7 +1789,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* Initialize single device information */ if (!RDEV(0).path[0]) { #ifdef CONFIG_BLK_DEV_ZONED - if (bdev_zoned_model(sbi->sb->s_bdev) == BLK_ZONED_NONE) + if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; max_devices = 1; #else From f6ba8b4893da189199657b0db85705ee247d9c36 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 May 2017 18:14:06 -0700 Subject: [PATCH 0363/1212] fscrypt: inline fscrypt_free_filename() fscrypt_free_filename() only needs to do a kfree() of crypto_buf.name, which works well as an inline function. We can skip setting the various pointers to NULL, since no user cares about it (the name is always freed just before it goes out of scope). 
Signed-off-by: Eric Biggers Reviewed-by: David Gstir Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 9 --------- include/linux/fscrypt_supp.h | 7 ++++++- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d1bb02b1ee58..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -453,12 +453,3 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index cd4e82c17304..32e2fcf13b01 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -47,7 +47,12 @@ extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); /* fname.c */ extern int fscrypt_setup_filename(struct inode *, const struct qstr *, int lookup, struct fscrypt_name *); -extern void fscrypt_free_filename(struct fscrypt_name *); + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + kfree(fname->crypto_buf.name); +} + extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, struct fscrypt_str *); From 73a2900701bac20ceb808514dc3b275e96484a79 Mon Sep 17 00:00:00 2001 From: Daniel Walter Date: Mon, 19 Jun 2017 09:27:58 +0200 Subject: [PATCH 0364/1212] fscrypt: add support for AES-128-CBC fscrypt provides facilities to use different encryption algorithms which are selectable by userspace when setting the encryption policy. Currently, only AES-256-XTS for file contents and AES-256-CBC-CTS for file names are implemented. This is a clear case of kernel offers the mechanism and userspace selects a policy. 
Similar to what dm-crypt and ecryptfs have. This patch adds support for using AES-128-CBC for file contents and AES-128-CBC-CTS for file name encryption. To mitigate watermarking attacks, IVs are generated using the ESSIV algorithm. While AES-CBC is actually slightly less secure than AES-XTS from a security point of view, there is more widespread hardware support. Using AES-CBC gives us the acceptable performance while still providing a moderate level of security for persistent storage. Especially low-powered embedded devices with crypto accelerators such as CAAM or CESA often only support AES-CBC. Since using AES-CBC over AES-XTS is basically thought of a last resort, we use AES-128-CBC over AES-256-CBC since it has less encryption rounds and yields noticeable better performance starting from a file size of just a few kB. Signed-off-by: Daniel Walter [david@sigma-star.at: addressed review comments] Signed-off-by: David Gstir Reviewed-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 23 +++-- fs/crypto/fscrypt_private.h | 9 +- fs/crypto/keyinfo.c | 175 +++++++++++++++++++++++++-------- fs/crypto/policy.c | 8 +- include/linux/fscrypt_common.h | 16 +-- include/uapi/linux/fs.h | 2 + 6 files changed, 174 insertions(+), 59 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6d6eca394d4d..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; @@ -147,8 +148,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -158,6 +159,16 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, 
BUG_ON(len == 0); + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -170,15 +181,11 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(lblk_num); - memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); sg_set_page(&src, src_page, len, offs); - skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -477,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 0b65491de28a..79d79755d79b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,10 +12,13 @@ #define _FSCRYPT_PRIVATE_H #include +#include /* Encryption parameters */ -#define FS_XTS_TWEAK_SIZE 16 +#define FS_IV_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 #define FS_AES_256_GCM_KEY_SIZE 32 #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 @@ -54,6 +57,7 @@ struct fscrypt_info { u8 ci_filename_mode; u8 ci_flags; struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; u8 
ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; @@ -97,4 +101,7 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 4636c18c2fb9..66e0728e9bbe 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,8 +10,13 @@ #include #include +#include +#include +#include #include "fscrypt_private.h" +static struct crypto_shash *essiv_hash_tfm; + static void derive_crypt_complete(struct crypto_async_request *req, int rc) { struct fscrypt_completion_result *ecr = req->data; @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. 
*/ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,7 +82,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix) + const char *prefix, int min_keysize) { char *description; struct key *keyring_key; @@ -111,50 +116,60 @@ static int validate_user_key(struct fscrypt_info *crypt_info, master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + 
[FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; + } + if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; - } - - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + 
return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -163,9 +178,76 @@ static void put_crypt_info(struct fscrypt_info *ci) return; crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; @@ -212,6 +294,7 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -228,10 +311,12 @@ int fscrypt_get_encryption_info(struct inode *inode) if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix); + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -243,18 +328,30 @@ int fscrypt_get_encryption_info(struct inode *inode) ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); + pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", + __func__, res, inode->i_ino); goto out; } crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + /* + * if the provided key is longer than keysize, we use the first + * keysize bytes of the derived key only + */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; + if (S_ISREG(inode->i_mode) && + crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { + res = init_essiv_generator(crypt_info, raw_key, keysize); + if (res) { + pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", + __func__, res, inode->i_ino); + goto out; + } + } if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) crypt_info = NULL; out: diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 210976e7a269..9914d51dff86 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -38,12 +38,8 @@ static int create_encryption_context_from_policy(struct inode *inode, memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) - return -EINVAL; - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; if (policy->flags & ~FS_POLICY_FLAGS_VALID) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 0a30c106c1e5..4022c61f7e9b 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -91,14 +91,18 @@ static inline bool fscrypt_dummy_context_enabled(struct inode *inode) return false; } -static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +static inline bool fscrypt_valid_enc_modes(u32 
contents_mode, + u32 filenames_mode) { - return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); -} + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; -static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) -{ - return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; } static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index a1533084395c..f3ef5016cf9c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -189,6 +189,8 @@ struct inodes_stat_t { #define FS_ENCRYPTION_MODE_AES_256_GCM 2 #define FS_ENCRYPTION_MODE_AES_256_CBC 3 #define FS_ENCRYPTION_MODE_AES_256_CTS 4 +#define FS_ENCRYPTION_MODE_AES_128_CBC 5 +#define FS_ENCRYPTION_MODE_AES_128_CTS 6 struct fscrypt_policy { From 8e85002136aab271e4c9bf4c60da5d5c1b46dc4b Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 11 Jul 2017 17:30:33 +0100 Subject: [PATCH 0365/1212] f2fs: remove extra inode_unlock() in error path This commit removes an extra inode_unlock() that is being done in function f2fs_ioc_setflags error path. While there, get rid of a useless 'out' label as well. Fixes: 0abd675e97e6 ("f2fs: support plain user/group quota") Signed-off-by: Luis Henriques Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 789d75beb7ed..435927e6c6f3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1523,7 +1523,6 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) /* Is it quota file? 
Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - inode_unlock(inode); ret = -EPERM; goto unlock_out; } @@ -1534,9 +1533,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } @@ -1549,7 +1547,6 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) f2fs_mark_inode_dirty_sync(inode, false); unlock_out: inode_unlock(inode); -out: mnt_drop_write_file(filp); return ret; } From 39480d9be6996dcafae10a9ca147ce5c77efaa83 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Jul 2017 14:56:49 -0700 Subject: [PATCH 0366/1212] f2fs: Don't clear SGID when inheriting ACLs This patch copies commit b7f8a09f80: "btrfs: Don't clear SGID when inheriting ACLs" written by Jan. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index ad26f59ba464..05d6f6095549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -214,7 +214,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { + if (acl && !ipage) { error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; From 7c7c34c88c646b70285fb2c3b9d004e3fbe0d011 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 13 Jul 2017 17:45:21 -0700 Subject: [PATCH 0367/1212] f2fs: include seq_file.h for sysfs.c This patch includes seq_file.h to avoid compile error. 
Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9adc202fcd6f..71191d89917d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ */ #include #include +#include #include "f2fs.h" #include "segment.h" From 745ad3de2fbae66415d2734837384ab6e648e357 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Jul 2017 11:45:21 -0700 Subject: [PATCH 0368/1212] f2fs: avoid cpu lockup Before retrying to flush data or dentry pages, we need to release cpu in order to prevent watchdog. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index aabf7c4984d3..e8ceff42d09b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -880,6 +880,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) struct inode *inode; struct f2fs_inode_info *fi; bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, get_pages(sbi, is_dir ? @@ -902,8 +903,17 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { + unsigned long cur_ino = inode->i_ino; + filemap_fdatawrite(inode->i_mapping); iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + } else { + ino = cur_ino; + } } else { /* * We should submit bio, since it exists several From 92fd02d1052e69d85e577f9b5ec5af0ec1de1dc3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 17 Jul 2017 19:16:11 +0800 Subject: [PATCH 0369/1212] f2fs: remove unused input parameter This patch remove unused input parameter in function new_node_page. 
Signed-off-by: Yunlei He Signed-off-by: Yong Sheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/node.c | 7 +++---- fs/f2fs/xattr.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ecfd7fc02b57..43c0956f7ce7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2369,8 +2369,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *get_node_page_ra(struct page *parent, int start); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d0d6a5830181..ed4014f677c2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -613,7 +613,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -1022,11 +1022,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aaf0a4167175..aad59c7c3a63 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -489,7 
+489,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); From f19a8a046a172655102a9aa0a995c4c77143df0c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 16 Jul 2017 15:08:54 +0800 Subject: [PATCH 0370/1212] f2fs: spread struct f2fs_dentry_ptr for inline path Use f2fs_dentry_ptr structure to indicate inline dentry structure as much as possible, so we can wrap inline dentry with size-fixed fields to the one with size-changeable fields. With this change, we can handle size-changeable inline dentry more easily. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 ++++- fs/f2fs/inline.c | 47 ++++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 43c0956f7ce7..a02645780fd7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -428,10 +428,11 @@ struct f2fs_flush_device { /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; static inline void make_dentry_ptr_block(struct inode *inode, @@ -439,6 +440,7 @@ static inline void make_dentry_ptr_block(struct inode *inode, { d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; @@ -449,6 +451,7 @@ static inline void make_dentry_ptr_inline(struct inode *inode, { d->inode = inode; d->max = NR_INLINE_DENTRY; + d->nr_bitmap = INLINE_DENTRY_BITMAP_SIZE; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c 
index 03c86e55e4a7..918eb89eb404 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -342,6 +342,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -360,21 +361,20 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(NULL, &src, inline_dentry); + make_dentry_ptr_block(NULL, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because them haven't been used so far. 
*/ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -511,9 +511,10 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, return PTR_ERR(ipage); inline_dentry = inline_data_addr(ipage); - bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { + make_dentry_ptr_inline(NULL, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; @@ -534,7 +535,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -558,6 +558,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -566,10 +567,11 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_wait_on_page_writeback(page, NODE, true); inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + make_dentry_ptr_inline(NULL, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 
1); @@ -587,19 +589,20 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct page *ipage; unsigned int bit_pos = 2; struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; inline_dentry = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + make_dentry_ptr_inline(NULL, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -614,7 +617,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_dentry_ptr d; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); @@ -627,7 +632,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? err : 0; From d143729d715df2a467e52fbe401f7ab2cc162dca Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 18 Jul 2017 09:48:12 +0800 Subject: [PATCH 0371/1212] f2fs: alloc new nids for xattr block in recovery recovery file A: recovery file B: -get_dnode_of_data -alloc_nid -recover_xattr_data -set_node_addr(sbi, &ni, NEW_ADDR, false); --->bug_on for nid has been used by file A In recovery process, new allocated node blocks may "reuse" xattr block nids, this patch alloc new nids for xattr blocks in recovery process to avoid this problem. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ed4014f677c2..46fb5c2693ad 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include @@ -2193,7 +2194,8 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; struct page *xpage; @@ -2209,22 +2211,22 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_xnid: /* 2: update xattr nid in inode */ - remove_free_nid(sbi, new_xnid); - f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(inc_valid_node_count(sbi, inode, false))) - f2fs_bug_on(sbi, 1); + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + alloc_nid_done(sbi, new_xnid); update_inode_page(inode); /* 3: update and set xattr node page dirty */ - xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); - if (!xpage) - return -ENOMEM; + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); - - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); set_page_dirty(xpage); f2fs_put_page(xpage, 1); From ada848409822bd027643a98e5cae45958650beca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Sun, 23 Jul 2017 22:32:54 -0300 Subject: [PATCH 0372/1212] f2fs: preserve i_mode if __f2fs_set_acl() 
fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing a file's acl mask, __f2fs_set_acl() will first set the group bits of i_mode to the value of the mask, and only then set the actual extended attribute representing the new acl. If the second part fails (due to lack of space, for example) and the file had no acl attribute to begin with, the system will from now on assume that the mask permission bits are actual group permission bits, potentially granting access to the wrong users. Prevent this by only changing the inode mode after the acl has been set. Signed-off-by: Ernesto A. Fernández Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 05d6f6095549..112f8e04c549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -210,15 +210,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; From 6bf7fc57146876306137d2229d42082165af5dbf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Jul 2017 10:59:55 -0700 Subject: [PATCH 0373/1212] f2fs: give a try to do atomic write in -ENOMEM case It'd be better to retry writing atomic pages when we get -ENOMEM. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9744e8c9d308..bf9d66fa0af5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -309,17 +309,21 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; last_idx = page->index; From 79e86c92c62c8bc5b699f657b016fd281a39f2ac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Jul 2017 19:46:29 -0700 Subject: [PATCH 0374/1212] f2fs: don't give partially written atomic data from process crash This patch resolves the below scenario. == Process 1 == == Process 2 == open(w) open(rw) begin write(new_#1) process_crash f_op->flush locks_remove_posix f_op->release read (new_#1) In order to avoid corrupted database caused by new_#1, we must do roll-back at process_crash time. In order to check that, this patch keeps task which triggers transaction begin, and does roll-back in f_op->flush before removing file locks. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a02645780fd7..8ec1afac4897 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -603,6 +603,7 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 435927e6c6f3..368aa332c833 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1480,6 +1480,22 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } +static int f2fs_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. 
+ */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; +} + #define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) #define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) @@ -1599,6 +1615,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) } inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: @@ -2496,6 +2513,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, From e088277a813b12d98f81f79da0c738b54974d56a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 22 Jul 2017 08:52:23 +0800 Subject: [PATCH 0375/1212] f2fs: make background threads of f2fs being aware of freezing When ->freeze_fs is called from lvm for doing snapshot, it needs to make sure there will be no more changes in filesystem's data, however, previously, background threads like GC thread wasn't aware of freezing, so in environment with active background threads, data of snapshot becomes unstable. This patch fixes this issue by adding sb_{start,end}_intwrite in below background threads: - GC thread - flush thread - discard thread Note that, don't use sb_start_intwrite() in gc_thread_func() due to: generic/241 reports below bug: ====================================================== WARNING: possible circular locking dependency detected 4.13.0-rc1+ #32 Tainted: G O ------------------------------------------------------ f2fs_gc-250:0/22186 is trying to acquire lock: (&sbi->gc_mutex){+.+...}, at: [] f2fs_sync_fs+0x7b/0x1b0 [f2fs] but task is already holding lock: (sb_internal#2){++++.-}, at: [] gc_thread_func+0x159/0x4a0 [f2fs] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (sb_internal#2){++++.-}: __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 __sb_start_write+0x11d/0x1f0 f2fs_evict_inode+0x2d6/0x4e0 [f2fs] evict+0xa8/0x170 iput+0x1fb/0x2c0 f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] write_checkpoint+0x1b1/0x750 [f2fs] f2fs_sync_fs+0x85/0x1b0 [f2fs] f2fs_do_sync_file.isra.24+0x137/0xa30 [f2fs] f2fs_sync_file+0x34/0x40 [f2fs] vfs_fsync_range+0x4a/0xa0 do_fsync+0x3c/0x60 SyS_fdatasync+0x15/0x20 do_fast_syscall_32+0xa1/0x1b0 entry_SYSENTER_32+0x4c/0x7b -> #1 (&sbi->cp_mutex){+.+...}: __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 __mutex_lock+0x4f/0x830 mutex_lock_nested+0x25/0x30 write_checkpoint+0x2f/0x750 [f2fs] f2fs_sync_fs+0x85/0x1b0 [f2fs] sync_filesystem+0x67/0x80 generic_shutdown_super+0x27/0x100 kill_block_super+0x22/0x50 kill_f2fs_super+0x3a/0x40 [f2fs] deactivate_locked_super+0x3d/0x70 deactivate_super+0x40/0x60 cleanup_mnt+0x39/0x70 __cleanup_mnt+0x10/0x20 task_work_run+0x69/0x80 exit_to_usermode_loop+0x57/0x92 do_fast_syscall_32+0x18c/0x1b0 entry_SYSENTER_32+0x4c/0x7b -> #0 (&sbi->gc_mutex){+.+...}: validate_chain.isra.36+0xc50/0xdb0 __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 __mutex_lock+0x4f/0x830 mutex_lock_nested+0x25/0x30 f2fs_sync_fs+0x7b/0x1b0 [f2fs] f2fs_balance_fs_bg+0xb9/0x200 [f2fs] gc_thread_func+0x302/0x4a0 [f2fs] kthread+0xe9/0x120 ret_from_fork+0x19/0x24 other info that might help us debug this: Chain exists of: &sbi->gc_mutex --> &sbi->cp_mutex --> sb_internal#2 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_internal#2); lock(&sbi->cp_mutex); lock(sb_internal#2); lock(&sbi->gc_mutex); *** DEADLOCK *** 1 lock held by f2fs_gc-250:0/22186: #0: (sb_internal#2){++++.-}, at: [] gc_thread_func+0x159/0x4a0 [f2fs] stack backtrace: CPU: 2 PID: 22186 Comm: f2fs_gc-250:0 Tainted: G O 4.13.0-rc1+ #32 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack+0x5f/0x92 
print_circular_bug+0x1b3/0x1bd validate_chain.isra.36+0xc50/0xdb0 ? __this_cpu_preempt_check+0xf/0x20 __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 ? f2fs_sync_fs+0x7b/0x1b0 [f2fs] __mutex_lock+0x4f/0x830 ? f2fs_sync_fs+0x7b/0x1b0 [f2fs] mutex_lock_nested+0x25/0x30 ? f2fs_sync_fs+0x7b/0x1b0 [f2fs] f2fs_sync_fs+0x7b/0x1b0 [f2fs] f2fs_balance_fs_bg+0xb9/0x200 [f2fs] gc_thread_func+0x302/0x4a0 [f2fs] ? preempt_schedule_common+0x2f/0x4d ? f2fs_gc+0x540/0x540 [f2fs] kthread+0xe9/0x120 ? f2fs_gc+0x540/0x540 [f2fs] ? kthread_create_on_node+0x30/0x30 ret_from_fork+0x19/0x24 The deadlock occurs in below condition: GC Thread Thread B - sb_start_intwrite - f2fs_sync_file - f2fs_sync_fs - mutex_lock(&sbi->gc_mutex) - write_checkpoint - block_operations - f2fs_sync_inode_meta - iput - sb_start_intwrite - mutex_lock(&sbi->gc_mutex) Fix this by altering sb_start_intwrite to sb_start_write_trylock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 9 +++++++-- fs/f2fs/segment.c | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c72da8733ba6..8f30dae0fe46 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -55,6 +55,9 @@ static int gc_thread_func(void *data) } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -69,12 +72,12 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. 
*/ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) @@ -93,6 +96,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bf9d66fa0af5..3573b95f4fab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -485,6 +485,8 @@ static int issue_flush_thread(void *data) if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -503,6 +505,8 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -1210,9 +1214,13 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + __issue_discard_cmd(sbi, true); __wait_discard_cmd(sbi, true); + sb_end_intwrite(sbi->sb); + congestion_wait(BLK_RW_SYNC, HZ/50); } while (!kthread_should_stop()); return 0; From 68a6e4b9740c7e16636360a317c12d4ba870eb21 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Jul 2017 12:58:59 -0700 Subject: [PATCH 0376/1212] f2fs: add ioctl to expose current features This patch adds an ioctl to provide feature information to user. For example, SQLite can use this ioctl to detect whether f2fs supports atomic write or not. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8ec1afac4897..4eb067f1b160 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -113,6 +113,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_ENCRYPT 0x0001 #define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -375,6 +376,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_flush_device) #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 368aa332c833..bc732b17be91 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2386,6 +2386,16 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) return ret; } +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. 
*/ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -2428,6 +2438,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_move_range(filp, arg); case F2FS_IOC_FLUSH_DEVICE: return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); default: return -ENOTTY; } @@ -2498,6 +2510,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: break; default: return -ENOIOCTLCMD; From 242ed6f4bbea201af6f4a8cc8db9f4b407bbfb25 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Jul 2017 00:19:05 +0800 Subject: [PATCH 0377/1212] f2fs: make max inline size changeable This patch tries to make the below macros calculate the max inline size and inline dentry field size considering a reserved, size-changeable space: - MAX_INLINE_DATA - NR_INLINE_DENTRY - INLINE_DENTRY_BITMAP_SIZE - INLINE_RESERVED_SIZE Then, when the inline_{data,dentry} option is enabled, it allows us to reserve inline space with different sizes flexibly for adding newly introduced inode attributes.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +- fs/f2fs/f2fs.h | 48 +++++++++++++++++---- fs/f2fs/inline.c | 95 +++++++++++++++++++++-------------------- fs/f2fs/inode.c | 4 +- fs/f2fs/super.c | 3 ++ include/linux/f2fs_fs.h | 23 +--------- 6 files changed, 96 insertions(+), 81 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b8588c8360e2..f31c71bcdf0e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -812,7 +812,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -1855,7 +1855,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4eb067f1b160..153a487b1189 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -424,6 +424,25 @@ struct f2fs_flush_device { u32 segments; /* # of segments to flush */ }; +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 + +static inline int get_inline_reserved_size(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * (DEF_ADDRS_PER_INODE -\ + get_inline_reserved_size(inode) -\ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + 
NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ @@ -449,14 +468,19 @@ static inline void make_dentry_ptr_block(struct inode *inode, } static inline void make_dentry_ptr_inline(struct inode *inode, - struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) + struct f2fs_dentry_ptr *d, void *t) { + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + d->inode = inode; - d->max = NR_INLINE_DENTRY; - d->nr_bitmap = INLINE_DENTRY_BITMAP_SIZE; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -610,6 +634,8 @@ struct f2fs_inode_info { struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + + int i_inline_reserved; /* reserved size in inline data */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2139,11 +2165,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + int reserved_size = get_inline_reserved_size(inode); - return (void *)&(ri->i_addr[1]); + return (void *)&(ri->i_addr[reserved_size]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2254,6 +2281,11 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } +static inline int get_inline_reserved_size(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_reserved; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, 
FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 918eb89eb404..ed5b1153901e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -22,7 +22,7 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -44,6 +44,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -51,12 +52,12 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) @@ -67,13 +68,13 @@ void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) + if (from >= MAX_INLINE_DATA(inode)) return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); if (from == 0) @@ -216,8 +217,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, 
MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); @@ -255,9 +256,9 @@ bool recover_inline_data(struct inode *inode, struct page *npage) f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,11 +286,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -300,9 +301,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -316,19 +317,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); 
set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -337,7 +338,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; @@ -357,12 +358,12 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); - make_dentry_ptr_inline(NULL, &src, inline_dentry); - make_dentry_ptr_block(NULL, &dst, dentry_blk); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); /* copy data from inline dentry block to new dentry block */ memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); @@ -395,14 +396,13 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -444,19 +444,19 @@ static int f2fs_add_inline_entries(struct inode *dir, } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; 
backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -473,9 +473,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -484,7 +484,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -500,7 +500,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *inline_dentry = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,8 +510,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { @@ -557,8 +557,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, void 
f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -566,8 +566,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) @@ -588,15 +588,15 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *inline_dentry; + void *inline_dentry; struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); @@ -612,9 +612,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -626,7 +626,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -657,7 +657,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = 
min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -666,7 +666,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6cd312a17c69..32ec6b23fe01 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -87,9 +87,9 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1eb2013fece6..ac719a3ef848 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -446,6 +446,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) #endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + + fi->i_inline_reserved = DEF_INLINE_RESERVED_SIZE; + return &fi->vfs_inode; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2b7183c5c9a3..bf27f140c21b 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -206,9 +206,6 @@ struct f2fs_extent { #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - 
F2FS_INLINE_XATTR_ADDRS - 1)) - struct f2fs_inode { __le16 i_mode; /* file mode */ __u8 i_advise; /* file hints */ @@ -465,7 +462,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -501,24 +498,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, From 40f4330d958ba11f77c3e443dc98ffe2b9f3cdd1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Jul 2017 00:19:06 +0800 Subject: [PATCH 0378/1212] f2fs: enhance on-disk inode structure scalability This patch add new flag F2FS_EXTRA_ATTR storing in inode.i_inline to indicate that on-disk structure of current inode is extended. In order to extend, we changed the inode structure a bit: Original one: struct f2fs_inode { ... struct f2fs_extent i_ext; __le32 i_addr[DEF_ADDRS_PER_INODE]; __le32 i_nid[DEF_NIDS_PER_INODE]; } Extended one: struct f2fs_inode { ... 
struct f2fs_extent i_ext; union { struct { __le16 i_extra_isize; __le16 i_padding; __le32 i_extra_end[0]; }; __le32 i_addr[DEF_ADDRS_PER_INODE]; }; __le32 i_nid[DEF_NIDS_PER_INODE]; } Once F2FS_EXTRA_ATTR is set, we will steal four bytes in the head of i_addr field for storing i_extra_isize and i_padding. With i_extra_isize, we can calculate the actual size of the reserved space in i_addr. The available attribute fields included in the total extra attribute fields for the current inode can be described as below: +--------------------+ | .i_mode | | ... | | .i_ext | +--------------------+ | .i_extra_isize |-----+ | .i_padding | | | .i_prjid | | | .i_atime_extra | | | .i_ctime_extra | | | .i_mtime_extra |<----+ | .i_inode_cs |<----- store blkaddr/inline from here | .i_xattr_cs | | ... | +--------------------+ | | | block address | | | +--------------------+ | .i_nid | +--------------------+ | node_footer | | (nid, ino, offset) | +--------------------+ Hence, with this patch, we would enhance scalability of f2fs inode for storing more newly added attributes.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 15 ++++++--- fs/f2fs/f2fs.h | 72 +++++++++++++++++++++++++++++++---------- fs/f2fs/file.c | 23 ++++++++----- fs/f2fs/gc.c | 2 +- fs/f2fs/inode.c | 32 +++++++++++------- fs/f2fs/namei.c | 5 +++ fs/f2fs/node.c | 7 ++-- fs/f2fs/recovery.c | 7 ++-- fs/f2fs/super.c | 11 +++++-- include/linux/f2fs_fs.h | 13 ++++++-- 10 files changed, 135 insertions(+), 52 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f31c71bcdf0e..376d59929ded 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -459,10 +459,14 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -506,8 +510,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -754,7 +758,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; @@ -901,7 +906,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, 
dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 153a487b1189..845ebcd4217e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -111,9 +111,10 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 -#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -426,10 +427,10 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 - -static inline int get_inline_reserved_size(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * (DEF_ADDRS_PER_INODE -\ - get_inline_reserved_size(inode) -\ +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ F2FS_INLINE_XATTR_ADDRS)) /* for inline dir */ @@ -635,7 +636,7 @@ struct f2fs_inode_info { struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; - int i_inline_reserved; /* reserved size in inline data */ + int i_extra_isize; /* size of extra space located in i_addr */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1856,20 +1857,38 @@ static inline bool IS_INODE(struct page *page) return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? 
node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1960,6 +1979,7 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2079,6 +2099,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2095,6 +2117,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -2105,8 +2134,8 @@ static 
inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) @@ -2168,9 +2197,9 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - int reserved_size = get_inline_reserved_size(inode); + int extra_size = get_extra_isize(inode); - return (void *)&(ri->i_addr[reserved_size]); + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2281,15 +2310,19 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } -static inline int get_inline_reserved_size(struct inode *inode) +static inline int get_extra_isize(struct inode *inode) { - return F2FS_I(inode)->i_inline_reserved; + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + /* * file.c */ @@ -2882,6 +2915,11 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bc732b17be91..f6a9ae012471 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -385,7 +385,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -470,9 +471,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -912,7 +917,8 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, 
dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -988,8 +994,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1158,7 +1164,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1169,8 +1176,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. 
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8f30dae0fe46..f74685ae008b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -587,7 +587,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 32ec6b23fe01..0a6699a23dfb 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -49,20 +49,22 @@ void f2fs_set_inode_flags(struct inode *inode) static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,16 +73,18 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = 
cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } @@ -153,6 +157,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -292,6 +299,9 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 541d755193c8..f098ae65363b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -72,6 +72,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 46fb5c2693ad..cde5526ec3fa 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -655,7 +655,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -2266,7 +2267,9 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & 
F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) + dst->i_extra_isize = src->i_extra_isize; new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 907d6b7dde6a..2d9b8182691f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -361,7 +361,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -414,8 +415,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ac719a3ef848..caf6f24ce3a5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -447,8 +447,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - fi->i_inline_reserved = DEF_INLINE_RESERVED_SIZE; - return &fi->vfs_inode; } @@ -1305,9 +1303,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. 
+ */ + /* two direct node blocks */ result += (leaf_count * 2); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index bf27f140c21b..350c6b931fdb 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -186,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -205,6 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -232,8 +235,14 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; From 186801baf7a2bee5fd187a123a533e46ccfc8e2b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 24 Jul 2017 17:12:06 +0800 Subject: [PATCH 0379/1212] f2fs: record quota during dot{,dot} recovery In ->lookup(), we will have a try to recover dot or dotdot for corrupted directory, once disk quota is on, if it allocates new block during dotdot recovery, we need to record disk quota info for the allocation, so this patch fixes 
this issue by adding missing dquot_initialize() in __recover_dot_dentries. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f098ae65363b..a0bd1c68ec9c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -266,6 +266,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); From fbe3ba58535fe619c191b5f384b6ec84e5e46e61 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Jul 2017 00:01:41 +0800 Subject: [PATCH 0380/1212] f2fs: support project quota This patch adds to support plain project quota. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 1 + fs/f2fs/f2fs.h | 29 ++++++++++++++++++++++++++++ fs/f2fs/file.c | 13 ------------- fs/f2fs/inode.c | 24 ++++++++++++++++++++++- fs/f2fs/namei.c | 31 ++++++++++++++++++++++++++++++ fs/f2fs/node.c | 7 ++++++- fs/f2fs/super.c | 22 ++++++++++++++++++++- include/linux/f2fs_fs.h | 3 +++ 8 files changed, 114 insertions(+), 16 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 3ba27469a8dd..5cf383f7fa8a 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -162,6 +162,7 @@ io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". usrquota Enable plain user disk quota accounting. grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. 
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 845ebcd4217e..dce0857a72f6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -92,6 +92,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -115,6 +116,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -637,6 +639,7 @@ struct f2fs_inode_info { struct rw_semaphore i_mmap_sem; int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1951,6 +1954,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1980,6 +1997,7 @@ enum { FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void 
__mark_inode_dirty_flag(struct inode *inode, @@ -2323,6 +2341,12 @@ static inline int get_extra_isize(struct inode *inode) (offsetof(struct f2fs_inode, i_extra_end) - \ offsetof(struct f2fs_inode, i_extra_isize)) \ +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + /* * file.c */ @@ -2920,6 +2944,11 @@ static inline int f2fs_sb_has_extra_attr(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); } +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f6a9ae012471..5f4355e9c336 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1503,19 +1503,6 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; -} - static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0a6699a23dfb..f15e663a1a15 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -114,6 +114,7 @@ static int do_read_inode(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -173,6 +174,16 @@ static int do_read_inode(struct 
inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -299,9 +310,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; - if (f2fs_has_extra_attr(inode)) + if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a0bd1c68ec9c..621b164bbe3c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -58,6 +58,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto fail_drop; @@ -90,6 +97,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, 
FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -209,6 +222,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -730,6 +748,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; @@ -918,6 +941,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cde5526ec3fa..62f7bb2227bf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2268,8 +2268,13 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); - if (dst->i_inline & F2FS_EXTRA_ATTR) + if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index caf6f24ce3a5..e641418751c1 100644 --- a/fs/f2fs/super.c +++ 
b/fs/f2fs/super.c @@ -109,6 +109,7 @@ enum { Opt_nolazytime, Opt_usrquota, Opt_grpquota, + Opt_prjquota, Opt_err, }; @@ -146,6 +147,7 @@ static match_table_t f2fs_tokens = { {Opt_nolazytime, "nolazytime"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, {Opt_err, NULL}, }; @@ -392,9 +394,13 @@ static int parse_options(struct super_block *sb, char *options) case Opt_grpquota: set_opt(sbi, GRPQUOTA); break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; #else case Opt_usrquota: case Opt_grpquota: + case Opt_prjquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -815,6 +821,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); #endif return 0; @@ -1173,6 +1181,14 @@ static void f2fs_quota_off_umount(struct super_block *sb) f2fs_quota_off(sb, type); } +#if 0 +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} +#endif + static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, .write_dquot = dquot_commit, @@ -1182,6 +1198,10 @@ static const struct dquot_operations f2fs_quota_operations = { .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, +#if 0 + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +#endif }; static const struct quotactl_ops f2fs_quotactl_ops = { @@ -1967,7 +1987,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &f2fs_sops; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h 
index 350c6b931fdb..5a6261a7f1ab 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -239,6 +239,7 @@ struct f2fs_inode { struct { __le16 i_extra_isize; /* extra inode attribute size */ __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ __le32 i_extra_end[0]; /* for attribute size calculation */ }; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ @@ -522,4 +523,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ From 8af6d9311d8beb57310d2a51d8ed876dc19f0b5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 26 Jul 2017 11:24:13 -0700 Subject: [PATCH 0381/1212] f2fs: avoid naming confusion of sysfs init This patch changes the function names of sysfs init to follow ext4. f2fs_init_sysfs <-> f2fs_register_sysfs f2fs_exit_sysfs <-> f2fs_unregister_sysfs Suggested-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++---- fs/f2fs/super.c | 12 ++++++------ fs/f2fs/sysfs.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dce0857a72f6..95f366e1f7ae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2904,10 +2904,10 @@ void destroy_extent_cache(void); /* * sysfs.c */ -int __init f2fs_register_sysfs(void); -void f2fs_unregister_sysfs(void); -int f2fs_init_sysfs(struct f2fs_sb_info *sbi); -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* * crypto support diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e641418751c1..24678120969e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -649,7 +649,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->ckpt); - f2fs_exit_sysfs(sbi); + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -2153,7 +2153,7 @@
static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - err = f2fs_init_sysfs(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; @@ -2224,7 +2224,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_sysfs: f2fs_sync_inode_meta(sbi); - f2fs_exit_sysfs(sbi); + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2342,7 +2342,7 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_register_sysfs(); + err = f2fs_init_sysfs(); if (err) goto free_extent_cache; err = register_shrinker(&f2fs_shrinker_info); @@ -2361,7 +2361,7 @@ static int __init init_f2fs_fs(void) free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2381,7 +2381,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 71191d89917d..5a78b9af92ef 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -304,7 +304,7 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); -int __init f2fs_register_sysfs(void) +int __init f2fs_init_sysfs(void) { f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); @@ -314,13 +314,13 @@ int __init f2fs_register_sysfs(void) return 0; } -void f2fs_unregister_sysfs(void) +void f2fs_exit_sysfs(void) { kset_unregister(f2fs_kset); remove_proc_entry("fs/f2fs", NULL); } -int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) { struct super_block *sb = sbi->sb; int err; @@ 
-351,7 +351,7 @@ int f2fs_init_sysfs(struct f2fs_sb_info *sbi) return err; } -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); From 4f71d28e090f62689f2b48ff25200120a68e07b6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Jul 2017 02:29:12 -0700 Subject: [PATCH 0382/1212] f2fs: don't need to wait for node writes for atomic write We have a node chain to serialize node block writes, so if any IOs for node block writes are reordered, we'll get broken node chain. IOWs, roll-forward recovery will see all or none node blocks given fsync mark. E.g., Node chain consists of: N1 -> N2 -> N3 -> NFSYNC -> N1' -> N2' -> N'FSYNC Reordered to: 1) N1 -> N2 -> N3 -> N2' -> NFSYNC -> N'FSYNC -> power-cut 2) N1 -> N2 -> N3 -> N1' -> NFSYNC -> power-cut 3) N1 -> N2 -> NFSYNC -> N1' -> N'FSYNC -> N3 -> power-cut 4) N1 -> NFSYNC -> N1' -> N2' -> N'FSYNC -> N3 -> power-cut Roll-forward recovery can proceed to: 1) N1 -> N2 -> N3 -> NFSYNC -> X 2) N1 -> N2 -> N3 -> NFSYNC -> N1' -> X 3) N1 -> N2 -> N3 -> NFSYNC -> N1' -> X 4) N1 -> X Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5f4355e9c336..db3f5023c713 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -277,9 +277,19 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark.
+ */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); From 9fcb9eca7376ff83a973e19431bc2596390708eb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Jul 2017 00:32:53 +0800 Subject: [PATCH 0383/1212] f2fs: introduce f2fs_statfs_project This patch introduces f2fs_statfs_project, it enables to show usage status of directory tree which is limited with project quota. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 24678120969e..991448cf762e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -706,6 +706,48 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -741,6 +783,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } From 48ac27052b634bbf3bbefeb9b77c22cd8b1b7388 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 27 Jul 2017 20:11:00 +0800 Subject: [PATCH 0384/1212] f2fs: provide f2fs_balance_fs to __write_node_page Let node writeback also do f2fs_balance_fs to ensure there are always enough free segments. Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/node.c | 16 ++++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e8ceff42d09b..24976959ef4b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1018,7 +1018,7 @@ static int block_operations(struct f2fs_sb_info *sbi) if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 95f366e1f7ae..5175e5b1bdfc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2471,7 +2471,8 @@ struct page *get_node_page_ra(struct page *parent, int start); void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control 
*wbc); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance); void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 62f7bb2227bf..133afd288b0b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1327,7 +1327,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) } static int __write_node_page(struct page *page, bool atomic, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, bool do_balance) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1396,6 +1396,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (submitted) *submitted = fio.submitted; + if (do_balance) + f2fs_balance_fs(sbi, false); return 0; redirty_out: @@ -1406,7 +1408,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc); + return __write_node_page(page, false, NULL, wbc, false); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1494,7 +1496,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, ret = __write_node_page(page, atomic && page == last_page, - &submitted, wbc); + &submitted, wbc, true); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1531,7 +1533,8 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance) { pgoff_t index, end; struct pagevec pvec; @@ -1609,7 +1612,8 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) set_fsync_mark(page, 0); set_dentry_mark(page, 0); - ret = __write_node_page(page, false, &submitted, wbc); + ret = __write_node_page(page, false, &submitted, + wbc, do_balance); if (ret) unlock_page(page); else if (submitted) @@ -1701,7 +1705,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; From 63b0ac86e6dec65e671282ed23319e7a096b4587 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 30 Jul 2017 09:45:14 -0700 Subject: [PATCH 0385/1212] f2fs: return wrong error number on f2fs_quota_write This must return size, not error number. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 991448cf762e..a8aa498c88eb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1123,7 +1123,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return err; + return 0; inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); From 12832f18b49d43473a9c59d7666887ee1d21d03c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 31 Jul 2017 20:19:09 +0800 Subject: [PATCH 0386/1212] f2fs: support inode checksum This patch adds to support inode checksum in f2fs. 
Signed-off-by: Chao Yu [Jaegeuk Kim: fix verification flow] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 32 +++++++++++++++++++ fs/f2fs/inode.c | 70 +++++++++++++++++++++++++++++++++++++++++ fs/f2fs/node.c | 7 +++++ fs/f2fs/segment.c | 5 ++- fs/f2fs/super.c | 5 +++ include/linux/f2fs_fs.h | 1 + 6 files changed, 119 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5175e5b1bdfc..fc958df78748 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -117,6 +117,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -1149,6 +1150,9 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; @@ -1237,6 +1241,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -2366,6 +2391,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * inode.c */ void 
f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); @@ -2950,6 +2977,11 @@ static inline int f2fs_sb_has_project_quota(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); } +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f15e663a1a15..b4c401d456e7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -108,6 +108,76 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, 
(__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 133afd288b0b..6c7cc7cdf776 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1171,6 +1171,11 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, err = -EIO; goto out_err; } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " @@ -2278,6 +2283,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; + + f2fs_inode_chksum_set(sbi, ipage); } new_ni = old_ni; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3573b95f4fab..af7da1b62e94 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2294,9 +2294,12 @@ 
void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + if (add_list) { struct f2fs_bio_info *io; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a8aa498c88eb..dd28d8bed37a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2001,6 +2001,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5a6261a7f1ab..c2a975e4a711 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -240,6 +240,7 @@ struct f2fs_inode { __le16 i_extra_isize; /* extra inode attribute size */ __le16 i_padding; /* padding */ __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ __le32 i_extra_end[0]; /* for attribute size calculation */ }; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ From 4eaf3d7698b8d5213a7f50e099be72aeea0ae6fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Jul 2017 17:14:09 -0700 Subject: [PATCH 0387/1212] f2fs: expose features to sysfs entry This patch exposes what features are supported by current f2fs build to sysfs entry via: /sys/fs/f2fs/features/ /sys/fs/f2fs/dev/features Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 156 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 5a78b9af92ef..1e31d0c5b6ab 100644 
--- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -18,7 +18,6 @@ #include "gc.h" static struct proc_dir_entry *f2fs_proc_root; -static struct kset *f2fs_kset; /* Sysfs support for f2fs */ enum { @@ -41,6 +40,7 @@ struct f2fs_attr { const char *, size_t); int struct_type; int offset; + int id; }; static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) @@ -76,6 +76,34 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, BD_PART_WRITTEN(sbi))); } +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -155,6 +183,30 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -172,6 +224,13 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); @@ -196,6 +255,18 @@ F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, 
FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -222,21 +293,53 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), ATTR_LIST(reserved_blocks), NULL, }; +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, }; -static struct kobj_type f2fs_ktype = { +static struct kobj_type f2fs_sb_ktype = { .default_attrs = f2fs_attrs, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; @@ -306,18 +409,29 @@ F2FS_PROC_FILE_DEF(segment_bits); int __init f2fs_init_sysfs(void) { - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + int ret; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) - return -ENOMEM; - return 0; + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, 
&f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; } void f2fs_exit_sysfs(void) { - kset_unregister(f2fs_kset); + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; } int f2fs_register_sysfs(struct f2fs_sb_info *sbi) @@ -325,6 +439,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) struct super_block *sb = sbi->sb; int err; + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -334,32 +455,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_bits_fops, sb); } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); - if (err) - goto err_out; return 0; -err_out: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + kobject_del(&sbi->s_kobj); } From 5ca0d2134d724b247e3f5df385b3b28e190cd09a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 2 Aug 2017 20:58:29 -0700 Subject: [PATCH 0388/1212] f2fs: use printk_ratelimited for f2fs_msg This patch reduces contention of printks. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dd28d8bed37a..d9a6f8132755 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -159,7 +159,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } From f18ec06e50207cb24e29523e7ad75237335d0b9c Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 2 Aug 2017 21:20:13 +0800 Subject: [PATCH 0389/1212] f2fs: update cur_valid_map_mir together with cur_valid_map When cur_valid_map passes the f2fs_test_and_set(,clear)_bit test, the update of cur_valid_map_mir is unexpectedly skipped, so fix it. With this fix, cur_valid_map_mir is always updated and checked together with cur_valid_map. Signed-off-by: Yunlong Song Signed-off-by: Chao Yu [Jaegeuk Kim: Fix unused variable and add unlikely for corner condition.] 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 48 +++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index af7da1b62e94..1a9737f764d1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1583,6 +1583,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1599,17 +1603,23 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); -#endif } +#endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; @@ -1620,17 +1630,23 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se->ckpt_valid_blocks++; } } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + 
f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); -#endif } +#endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; From 98407fc7a07f5b7d21fc5caaf00844f3889e77a0 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 2 Aug 2017 22:16:54 +0800 Subject: [PATCH 0390/1212] f2fs: do not change the valid_block value if cur_valid_map was wrongly set or cleared Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1a9737f764d1..09df86430ed0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1618,6 +1618,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_msg(sbi->sb, KERN_ERR, "Bitmap was wrongly set, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; } if (f2fs_discard_en(sbi) && @@ -1645,6 +1647,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_msg(sbi->sb, KERN_ERR, "Bitmap was wrongly cleared, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; } if (f2fs_discard_en(sbi) && From d39f75a593462334d1baf72b67e57bd93e9a1b0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Aug 2017 23:21:48 +0800 Subject: [PATCH 0391/1212] f2fs: add app/fs io stat This patch enables inner app/fs io stats and introduces the following virtual fs nodes for exposing stats info: /sys/fs/f2fs/<dev>/iostat_enable /proc/fs/f2fs/<dev>/iostat_info Signed-off-by: Chao Yu [Jaegeuk Kim: fix wrong stat assignment] Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 34 +++++++++++++++++-------- fs/f2fs/data.c | 35 +++++++++++++++++++------- fs/f2fs/f2fs.h | 59 
+++++++++++++++++++++++++++++++++++++++++--- fs/f2fs/file.c | 7 +++++- fs/f2fs/gc.c | 3 +++ fs/f2fs/inline.c | 1 + fs/f2fs/node.c | 15 ++++++----- fs/f2fs/segment.c | 21 ++++++++++++++-- fs/f2fs/super.c | 4 +++ fs/f2fs/sysfs.c | 52 ++++++++++++++++++++++++++++++++++++++ 10 files changed, 200 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 24976959ef4b..2b29d8b836fa 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -231,8 +231,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -245,7 +246,7 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -264,6 +265,12 @@ static int f2fs_write_meta_page(struct page *page, return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -284,7 +291,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -296,7 +303,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, } long sync_meta_pages(struct 
f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -347,7 +354,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -905,7 +912,14 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) if (inode) { unsigned long cur_ino = inode->i_ino; + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); /* We need to give cpu to another writers. */ if (ino == cur_ino) { @@ -1018,7 +1032,7 @@ static int block_operations(struct f2fs_sb_info *sbi) if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc, false); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1116,7 +1130,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1195,7 +1209,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1250,7 +1264,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - 
sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 376d59929ded..47584eb07ddf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1473,7 +1473,8 @@ int do_write_data_page(struct f2fs_io_info *fio) } static int __write_data_page(struct page *page, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1494,6 +1495,7 @@ static int __write_data_page(struct page *page, bool *submitted, .encrypted_page = NULL, .submitted = false, .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); @@ -1600,7 +1602,7 @@ static int __write_data_page(struct page *page, bool *submitted, static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, wbc); + return __write_data_page(page, NULL, wbc, FS_DATA_IO); } /* @@ -1609,7 +1611,8 @@ static int f2fs_write_data_page(struct page *page, * warm/hot data page. 
*/ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1699,7 +1702,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1754,8 +1757,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping, return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1792,7 +1796,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, goto skip_write; blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (wbc->sync_mode == WB_SYNC_ALL) @@ -1811,6 +1815,16 @@ static int f2fs_write_data_pages(struct address_space *mapping, return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? 
+ FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -2076,10 +2090,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc958df78748..976944eb8491 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,6 +621,7 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ @@ -927,6 +928,23 @@ enum need_lock_type { LOCK_RETRY, }; +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ @@ -941,6 +959,7 @@ struct f2fs_io_info { bool submitted; /* 
indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; #define is_read_io(rw) ((rw) == READ) @@ -1132,6 +1151,11 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -2372,6 +2396,31 @@ static inline int get_extra_isize(struct inode *inode) sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} + /* * file.c */ @@ -2499,7 +2548,7 @@ void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, - bool do_balance); + bool do_balance, enum iostat_type io_type); void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); @@ -2542,7 +2591,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool exist_trim_candidates(struct 
f2fs_sb_info *sbi, struct cp_control *cpc); struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); int rewrite_data_page(struct f2fs_io_info *fio); @@ -2583,7 +2633,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write); + long nr_to_write, enum iostat_type io_type); void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); @@ -2636,6 +2686,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index db3f5023c713..a606dadcedee 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -101,6 +101,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ @@ -1792,7 +1794,7 @@ static int 
f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -2473,6 +2475,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f74685ae008b..0cf76a5e3997 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -689,6 +689,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -736,6 +738,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .page = page, .encrypted_page = NULL, .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ed5b1153901e..041072017ef8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -117,6 +117,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6c7cc7cdf776..bc748df0b04f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1332,7 +1332,8 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) } static int __write_node_page(struct page *page, bool atomic, bool *submitted, - struct writeback_control *wbc, bool do_balance) + struct writeback_control *wbc, bool do_balance, + 
enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1345,6 +1346,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .io_type = io_type, }; trace_f2fs_writepage(page, NODE); @@ -1413,7 +1415,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc, false); + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1501,7 +1503,8 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, ret = __write_node_page(page, atomic && page == last_page, - &submitted, wbc, true); + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1539,7 +1542,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, } int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, - bool do_balance) + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1618,7 +1621,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, set_dentry_mark(page, 0); ret = __write_node_page(page, false, &submitted, - wbc, do_balance); + wbc, do_balance, io_type); if (ret) unlock_page(page); else if (submitted) @@ -1710,7 +1713,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc, true); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 09df86430ed0..edc7c3d254c7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c 
@@ -292,6 +292,7 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -903,6 +904,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, submit_bio(REQ_SYNC, bio); list_move_tail(&dc->list, &dcc->wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); } } else { __remove_discard_cmd(sbi, dc); @@ -2351,7 +2354,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) } } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -2370,6 +2374,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) set_page_writeback(page); f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2378,6 +2384,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -2391,13 +2399,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - return f2fs_submit_page_bio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct 
f2fs_summary *sum, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d9a6f8132755..318df0660b74 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2064,6 +2064,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1: NR_TEMP_TYPE; int j; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1e31d0c5b6ab..3d6bbdb743b0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -153,6 +153,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, return count; } *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + return count; } @@ -250,6 +254,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -288,6 +293,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), @@ -391,6 +397,48 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, 
"time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + #define F2FS_PROC_FILE_DEF(_name) \ static int _name##_open_fs(struct inode *inode, struct file *file) \ { \ @@ -406,6 +454,7 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); int __init f2fs_init_sysfs(void) { @@ -454,6 +503,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) &f2fs_seq_segment_info_fops, sb); proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); } return 0; } @@ -461,6 +512,7 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, 
f2fs_proc_root); From c9881425b5b16c4dd9656d6ae0b95029157ccc50 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Fri, 4 Aug 2017 17:07:15 +0800 Subject: [PATCH 0392/1212] f2fs: fix the size value in __check_sit_bitmap The current size value is not correct and will miss bitmap check. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index edc7c3d254c7..20f466ace8b0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -868,11 +868,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); - size = min((unsigned long)(end - blk), max_blocks); + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); - blk += size; + blk = START_BLOCK(sbi, segno + 1); } #endif } From 2d982d49c3205e47fc8bf92ee98c4a4e90e67cfd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 5 Aug 2017 14:25:08 -0700 Subject: [PATCH 0393/1212] f2fs: use IPU for cold files We expect cold files write data sequentially, but sometimes some of small data can be updated, which incurs fragmentation. Let's avoid that. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9ba1f1d9723..84242eb5226f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -577,6 +577,10 @@ static inline bool need_inplace_update_policy(struct inode *inode, if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) From bdf6e5ea9299f0893e3b304316941ae40a7a3897 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 6 Aug 2017 22:09:00 -0700 Subject: [PATCH 0394/1212] f2fs: introduce gc_urgent mode for background GC This patch adds a sysfs entry to control urgent mode for background GC. If this is set, background GC thread conducts GC with gc_urgent_sleep_time all the time. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 12 ++++++++++++ Documentation/filesystems/f2fs.txt | 9 +++++++++ fs/f2fs/gc.c | 17 +++++++++++++++-- fs/f2fs/gc.h | 4 ++++ fs/f2fs/sysfs.c | 9 +++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2805ce062fdb..6c2c50b4e781 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -124,3 +124,15 @@ Date: June 2017 Contact: "Chao Yu" Description: Controls current reserved blocks in system. 
+ +What: /sys/fs/f2fs/<disk>/gc_urgent +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Do background GC aggressively + +What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Controls sleep time of GC urgent mode diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 5cf383f7fa8a..8a3f991098ad 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -208,6 +208,15 @@ Files in /sys/fs/f2fs/ gc_idle = 1 will select the Cost Benefit approach & setting gc_idle = 2 will select the greedy approach. + gc_urgent This parameter controls triggering background GCs + urgently or not. Setting gc_urgent = 0 [default] + reverts to the default behavior, while if it is set + to 1, background thread starts to do GC at the given + gc_urgent_sleep_time interval. + + gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. + 500 ms is set by default. See above gc_urgent. + reclaim_segments This parameter controls the number of prefree segments to be reclaimed. 
If the number of prefree segments is larger than the number of segments diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0cf76a5e3997..3c05eea382b9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,9 +35,14 @@ static int gc_thread_func(void *data) set_freezable(); do { wait_event_interruptible_timeout(*wq, - kthread_should_stop() || freezing(current), + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; if (kthread_should_stop()) @@ -74,6 +79,11 @@ static int gc_thread_func(void *data) if (!mutex_trylock(&sbi->gc_mutex)) goto next; + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } + if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); @@ -84,7 +94,7 @@ static int gc_thread_func(void *data) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ @@ -115,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..57a9000ce3af 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { 
wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3d6bbdb743b0..c40e5d24df9f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -156,6 +156,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + } return count; } @@ -235,10 +239,13 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .id = _id, \ } +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -275,10 +282,12 @@ F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), ATTR_LIST(gc_min_sleep_time), ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), ATTR_LIST(batched_trim_sections), From 
3f42e75b2df897eeaac4f5c8c38d2c072499753a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Aug 2017 16:37:59 +0800 Subject: [PATCH 0395/1212] f2fs: avoid unneeded sync on quota file We only need to sync the quota file of the specified quota type instead of all types in f2fs_quota_{on,off}. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 318df0660b74..801ab4ceeb36 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1177,7 +1177,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, struct inode *inode; int err; - err = f2fs_quota_sync(sb, -1); + err = f2fs_quota_sync(sb, type); if (err) return err; @@ -1205,7 +1205,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - f2fs_quota_sync(sb, -1); + f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); if (err) From 3ba499e5fa949b7711d45b8957c670029adf51c5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Aug 2017 23:12:46 +0800 Subject: [PATCH 0396/1212] f2fs: fix potential overflow when adjusting GC cycle When comparing signed and unsigned variables, the compiler converts the signed value to an unsigned one; because of this, {in,de}crease_sleep_time may produce an overflowed result. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- fs/f2fs/gc.h | 23 +++++++++++++++-------- include/trace/events/f2fs.h | 6 +++--- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3c05eea382b9..faed28e56203 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,7 +28,7 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 57a9000ce3af..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -69,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 20c4556ab56d..167c40850f98 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -535,14 
+535,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), @@ -554,7 +554,7 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, From 9e6ece8a411241dd247f1c8afa0ca5860d1ba1d7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Aug 2017 10:54:31 +0800 Subject: [PATCH 0397/1212] f2fs: support journalled quota This patch supports to enable f2fs to accept quota information through mount option: - {usr,grp,prj}jquota= - jqfmt= Then, in ->mount flow, we can recover quota file during log replaying, by this, journelled quota can be supported. Signed-off-by: Chao Yu [Jaegeuk Kim: Fix wrong return values.] Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 9 + fs/f2fs/checkpoint.c | 26 ++- fs/f2fs/f2fs.h | 9 + fs/f2fs/recovery.c | 72 ++++++- fs/f2fs/super.c | 326 +++++++++++++++++++++++++++-- 5 files changed, 412 insertions(+), 30 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 8a3f991098ad..6cf9ad12c57f 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -163,6 +163,15 @@ io_bits=%u Set the bit size of write IO requests. It should be set usrquota Enable plain user disk quota accounting. grpquota Enable plain group disk quota accounting. prjquota Enable plain project quota accounting. 
+usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journelled quota. +offgrpjquota Turn off group journelled quota. +offprjjquota Turn off project journelled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2b29d8b836fa..e86f67ac96c6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -589,11 +589,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -609,14 +622,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info 
*sbi, block_t start_blk) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 976944eb8491..310d8588ad3c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -93,6 +93,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -1181,6 +1182,12 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -2510,6 +2517,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2d9b8182691f..a3d02613934a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = 
f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ static int recover_dentry(struct inode *inode, struct page *ipage, err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -226,18 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { + bool quota_inode = false; + if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
*/ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -328,10 +354,18 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -558,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -574,11 +623,11 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -587,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages 
to be used by the recovery */ @@ -615,5 +664,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 801ab4ceeb36..4a5eae7ec64c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -107,9 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -145,9 +157,20 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; @@ -170,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot 
change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -177,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -388,6 +512,7 @@ static int parse_options(struct super_block *sb, char *options) sb->s_flags &= ~MS_LAZYTIME; break; #ifdef CONFIG_QUOTA + case Opt_quota: case Opt_usrquota: set_opt(sbi, USRQUOTA); break; @@ -397,10 +522,66 @@ static int parse_options(struct super_block *sb, char *options) case Opt_prjquota: set_opt(sbi, PRJQUOTA); break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, 
PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; #else + case Opt_quota: case Opt_usrquota: case Opt_grpquota: case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -412,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -591,7 +776,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } -static void f2fs_quota_off_umount(struct super_block *sb); static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -659,6 +843,10 @@ static void f2fs_put_super(struct super_block *sb) destroy_device_list(sbi); if (sbi->write_io_dummy) mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); @@ -672,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -792,6 +983,40 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, 
+ struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -865,6 +1090,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) sbi->fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); if (test_opt(sbi, USRQUOTA)) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) @@ -872,6 +1099,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, PRJQUOTA)) seq_puts(seq, ",prjquota"); #endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } @@ -920,6 +1148,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we @@ -929,6 +1162,23 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 
0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -1010,6 +1260,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -1024,6 +1279,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; @@ -1140,6 +1402,27 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) return &F2FS_I(inode)->i_reserved_quota; } +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + static int f2fs_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); @@ -1221,7 +1504,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) return err; } -static void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { int type; @@ -1262,7 +1545,7 @@ static const 
struct quotactl_ops f2fs_quotactl_ops = { .set_dqblk = dquot_set_dqblk, }; #else -static inline void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { } #endif @@ -2186,11 +2469,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_nm; - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { @@ -2214,6 +2492,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_root_inode; + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); + if (err) + goto free_sysfs; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -2223,7 +2506,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_sysfs; + goto free_meta; } if (need_fsck) @@ -2237,7 +2520,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_sysfs; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2261,7 +2544,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_sysfs; + goto free_meta; } kfree(options); @@ -2279,8 +2562,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_update_time(sbi, REQ_TIME); return 0; -free_sysfs: +free_meta: f2fs_sync_inode_meta(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. 
Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); @@ -2290,13 +2581,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). - */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); f2fs_destroy_stats(sbi); @@ -2316,6 +2600,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); From 99dae6bc11ad5687760004133f26ad3e14d86a74 Mon Sep 17 00:00:00 2001 From: Qiuyang Sun Date: Wed, 9 Aug 2017 17:27:30 +0800 Subject: [PATCH 0398/1212] f2fs: merge equivalent flags F2FS_GET_BLOCK_[READ|DIO] Currently, the two flags F2FS_GET_BLOCK_[READ|DIO] are totally equivalent and can be used interchangeably in all scenarios they are involved in. Neither of the flags is referenced in f2fs_map_blocks(), making them both the default case. To remove the ambiguity, this patch merges both flags into F2FS_GET_BLOCK_DEFAULT, and introduces an enum for all distinct flags.
Signed-off-by: Qiuyang Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 13 +++++++------ fs/f2fs/file.c | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 47584eb07ddf..d7aa2e908570 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1043,7 +1043,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1242,7 +1242,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 310d8588ad3c..1aac76dd1938 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -574,12 +574,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a606dadcedee..30dc356d922c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2051,7 +2051,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -2093,7 +2093,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; From 2b476db7b17c8525f0c5b9b42b6225118b38dd0e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Aug 2017 17:35:04 -0700 Subject: [PATCH 0399/1212] f2fs: let fill_super handle roll-forward errors If we set CP_ERROR_FLAG on a roll-forward error, f2fs is no longer able to proceed with any IOs due to f2fs_cp_error(). But, for example, if some stale data is involved in the roll-forward process, we can get -ENOENT, leaving the fs stuck. If we get any error, let fill_super set SBI_NEED_FSCK and try to recover back to a stable point.
Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a3d02613934a..f707d810c87d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -649,8 +649,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ From f542a0378dc8b18ffb09bcdbf23aa55c260d6acc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Aug 2017 19:09:08 +0800 Subject: [PATCH 0400/1212] f2fs: retry to revoke atomic commit in -ENOMEM case During atomic committing, if we encounter -ENOMEM in revoke path, it's better to give a chance to retry revoking. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 20f466ace8b0..03849778b881 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -213,9 +213,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } From 6ae3dde9ed3cba9eba3d5e95ba8dba1635134d91 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 11 Aug 2017 18:00:15 +0800 Subject: [PATCH 0401/1212] f2fs: add tracepoint for f2fs_gc This patch adds tracepoint for f2fs_gc. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 50 ++++++++++++----- include/trace/events/f2fs.h | 107 ++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index faed28e56203..ccb00de9b0b0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -919,7 +919,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -965,6 +965,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } @@ -975,21 +979,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, true) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno) { int gc_type = sync ? 
FG_GC : BG_GC; - int sec_freed = 0; - int ret; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -997,6 +997,15 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { @@ -1023,17 +1032,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, gc_type = FG_GC; } - ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ - if (gc_type == BG_GC && !background) + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - if (!__get_victim(sbi, &segno, gc_type)) + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; - ret = 0; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; @@ -1050,6 +1062,16 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 167c40850f98..7063bbcca03b 100644 --- 
a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -561,6 +561,113 @@ TRACE_EVENT(f2fs_background_gc, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, 
reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, From 8a8cce5b1f1705f757747ea558985e5eb2a7c69d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 12 Aug 2017 21:33:23 -0700 Subject: [PATCH 0402/1212] f2fs: check hot_data for roll-forward recovery We need to check HOT_DATA to truncate any previous data block when doing roll-forward recovery. 
Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f707d810c87d..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -317,7 +317,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; From a50bb55fd335f59297c2b2a9f11f5fd209ef0a77 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 14 Aug 2017 16:52:43 +0800 Subject: [PATCH 0403/1212] f2fs: remove unused function overprovision_sections Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 84242eb5226f..a843751b253b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -492,11 +492,6 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); From 4ff6d9bf5af4c74a6a69e32a630975c92f9614d1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Aug 2017 23:09:56 +0800 Subject: [PATCH 0404/1212] f2fs: introduce discard_granularity sysfs entry Commit d618ebaf0aa8 ("f2fs: enable small discard by default") enables f2fs to issue 4K size discard in real-time discard mode. However, issuing smaller discards may cost more lifetime while releasing less free space in the flash device.
Since f2fs has the ability of separating hot/cold data and garbage collection, we can expect that a small-sized invalid region would expand soon with OPU, deletion or garbage collection on valid data, so it's better to delay or skip issuing smaller size discards; it could help to reduce excessive consumption of IO bandwidth and lifetime of flash storage. This patch makes f2fs select 64K size as its default minimal granularity, and issue discards with a size which is not smaller than the minimal granularity. Also it exposes discard granularity as a sysfs entry for configuration in different scenarios. Jaegeuk Kim: We must issue all the accumulated discard commands when fstrim is called. So, I've added pend_list_tag[] to indicate whether we should issue the commands or not. If tag sets P_ACTIVE or P_TRIM, we have to issue them. P_TRIM is set once at a time, given fstrim trigger. In addition, issue_discard_thread is calling too much due to the number of discard commands remaining in the pending list. I added a timer to control it like gc_thread. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 +++ fs/f2fs/f2fs.h | 12 ++++ fs/f2fs/segment.c | 91 +++++++++++++++++++++---- fs/f2fs/sysfs.c | 23 +++++++ 4 files changed, 121 insertions(+), 14 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 6c2c50b4e781..500c60403653 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" Description: Controls the issue rate of small discard commands. +What: /sys/fs/f2fs//discard_granularity +Date: July 2017 +Contact: "Chao Yu" +Description: + Controls discard granularity of inner discard thread, inner thread + will not issue discards with size that is smaller than granularity. + The unit size is one block, now only support configuring in range + of [1, 512].
+ What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1aac76dd1938..e13daceb7995 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -215,6 +215,8 @@ enum { (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -263,11 +265,18 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + enum { D_PREP, D_SUBMIT, @@ -303,11 +312,14 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. 
discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 03849778b881..97d43373e10e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1096,32 +1096,65 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0; + int iter = 0, issued = 0; + int i; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if (!issue_cond || is_idle(sbi)) + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { __submit_discard_cmd(sbi, dc); + issued++; + continue; + } + + if (!issue_cond || is_idle(sbi)) { + issued++; + __submit_discard_cmd(sbi, dc); + } if (issue_cond && iter++ > DISCARD_ISSUE_RATE) goto out; } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } out: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + 
list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); } static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, @@ -1206,34 +1239,56 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); __wait_discard_cmd(sbi, false); } +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; set_freezable(); do { - wait_event_interruptible(*q, kthread_should_stop() || - freezing(current) || - atomic_read(&dcc->discard_cmd_cnt)); + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; if (kthread_should_stop()) return 0; + if (dcc->discard_wake) + dcc->discard_wake = 0; + sb_start_intwrite(sbi->sb); - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } sb_end_intwrite(sbi->sb); - congestion_wait(BLK_RW_SYNC, HZ/50); } while (!kthread_should_stop()); return 0; } @@ -1424,7 +1479,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = 
&(SM_I(sbi)->dcc_info->entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; @@ -1506,11 +1562,12 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto find_next; list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= total_len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) @@ -1528,9 +1585,13 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) + for (i = 0; i < MAX_PLIST_NUM; i++) { INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); @@ -2207,6 +2268,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c40e5d24df9f..4bcaa9059026 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -152,6 +152,27 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, spin_unlock(&sbi->stat_lock); return count; } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i 
= 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + return count; + } + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) @@ -248,6 +269,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); @@ -290,6 +312,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), From 8f8b9cda392501633678aa75ed8830d31e8e79f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 15 Aug 2017 21:27:19 -0700 Subject: [PATCH 0405/1212] f2fs: issue discard commands if gc_urgent is set It's time to issue all the discard commands, if user sets the idle time. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++++- fs/f2fs/sysfs.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 97d43373e10e..abfa55174d0c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -21,6 +21,7 @@ #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include @@ -1274,8 +1275,11 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; - if (dcc->discard_wake) + if (dcc->discard_wake) { dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } sb_start_intwrite(sbi->sb); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 4bcaa9059026..b9ad9041559f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -178,8 +178,13 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); } return count; From 440c08fb62d22b49bcb846b1805a20441d297172 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Aug 2017 22:53:45 +0800 Subject: [PATCH 0406/1212] f2fs: fix out-of-order execution in f2fs_issue_flush In f2fs_issue_flush, due to out-of-order execution of CPU, wake_up can be called before we insert issue_list, result in long latency of wait_for_completion. Fix this by adding smp_mb() to force the order of related codes. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index abfa55174d0c..e9416ae025aa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -549,7 +549,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { From 9071bb1c094f70ab08253b9335e62b8a79e20b15 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Aug 2017 23:37:36 +0800 Subject: [PATCH 0407/1212] f2fs: clear FI_HOT_DATA correctly This patch fixes to clear FI_HOT_DATA correctly in below path: - error handling in f2fs_ioc_start_atomic_write - after commit atomic write in f2fs_ioc_commit_atomic_write - after drop atomic write in drop_inmem_pages Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ fs/f2fs/segment.c | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 30dc356d922c..25087401b2e6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1617,6 +1617,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); goto out; } @@ -1655,6 +1656,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e9416ae025aa..78a0e8ee62b8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -255,6 +255,7 @@ void drop_inmem_pages(struct inode 
*inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } From e1a34a55188890b7addd174aa83aee14840c8c9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Aug 2017 16:20:33 +0800 Subject: [PATCH 0408/1212] f2fs: trigger normal fsync for non-atomic_write file If a file was not opened in atomic write mode, but the user uses the atomic write ioctl to fsync its data, we should not fsync that file in atomic write mode. Fixes: 608514deba38 ("f2fs: set fsync mark only for the last dnode") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 25087401b2e6..1c3dffc987b1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1660,7 +1660,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, false); } err_out: inode_unlock(inode); From 5469cedba6796ebb2e1dceca9a7c3b605e215cd6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 21 Aug 2017 13:51:32 -0700 Subject: [PATCH 0409/1212] f2fs: return error when accessing insane file offset If file offset is insane, we have to return error instead of kernel panic. 
Reported-by: Eric Zhang Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc748df0b04f..a825a973dcf2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -555,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -579,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -878,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { From 1e5c4e7c8dc42ce706d09ace943ab29fbe6aa6ac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Aug 2017 21:15:43 -0700 Subject: [PATCH 0410/1212] f2fs: wake up discard_thread iff there is a candidate This patch fixes to avoid needless wake ups. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +-- fs/f2fs/segment.h | 25 +++++++++++++++++++++++++ fs/f2fs/sysfs.c | 6 +----- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 78a0e8ee62b8..00253111c227 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1574,8 +1574,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) kmem_cache_free(discard_entry_slab, entry); } - dcc->discard_wake = 1; - wake_up_interruptible_all(&dcc->discard_wait_queue); + wake_up_discard_thread(sbi, false); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a843751b253b..b8aa84109bf5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -797,3 +797,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b9ad9041559f..962735dc9c63 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -178,13 +178,9 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - sbi->gc_thread->gc_wake = 1; 
wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); - - dcc->discard_wake = 1; - wake_up_interruptible_all(&dcc->discard_wait_queue); + wake_up_discard_thread(sbi, true); } return count; From 0520ca37ef89ae0ce3679da7616b7a66a0cfa774 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Aug 2017 18:23:24 +0800 Subject: [PATCH 0411/1212] f2fs: fix to avoid race in between aio and gc We won't wait DIO synchronously when doing AIO, so there will be potential IO reorder in between AIO and GC, which will cause data corruption. This patch adds inode_dio_wait to serialize aio and data GC to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ccb00de9b0b0..382b7d386ffb 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -875,6 +875,9 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) From 077e22bf7a87013157dca27ea9e7ff3adcd75385 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Aug 2017 18:23:25 +0800 Subject: [PATCH 0412/1212] f2fs: trigger fdatasync for non-atomic_write file Sqlite only cares about synchronization of file data instead of other data unrelated attribute of inode, so in commit flow, call fdatasync is enough. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1c3dffc987b1..6c2ebe91afeb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1660,7 +1660,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); From c953aed665079b3f3878497e0dcaf763a5645f50 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 31 Aug 2017 11:10:58 -0700 Subject: [PATCH 0413/1212] f2fs: don't need to update inode checksum for recovery This patch fixes "f2fs: support inode checksum". The recovered inode page will be rewritten with valid checksum. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a825a973dcf2..d789cff5ffb1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2290,8 +2290,6 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; - - f2fs_inode_chksum_set(sbi, ipage); } new_ni = old_ni; From 85825456837e849a5cf8d6de78edf8771fe44b98 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 31 Aug 2017 16:54:51 -0700 Subject: [PATCH 0414/1212] f2fs: don't check inode's checksum if it was dirtied or writebacked If another thread already made the page dirtied or writebacked, we must avoid to verify checksum. If we got an error, we need to remove its uptodate as well. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 ++- fs/f2fs/node.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b4c401d456e7..c33b05aec1a1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -153,7 +153,8 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_inode *ri; __u32 provided, calculated; - if (!f2fs_enable_inode_chksum(sbi, page)) + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) return true; ri = &F2FS_NODE(page)->i; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d789cff5ffb1..32474db18ad9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1187,9 +1187,9 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - ClearPageUptodate(page); err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); return ERR_PTR(err); } From 6337ccbeee428f9859925f90fe935093d0c692b7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Aug 2017 18:04:47 +0800 Subject: [PATCH 0415/1212] f2fs: update i_flags correctly f2fs enables hash-indexed directories by default, so we need to tag FS_INDEX_FL in inode::i_flags during directory creation, in order to show the correct status of the inode in lsattr: Before: ------------------- /mnt/f2fs/dir/ After: -----------I------- /mnt/f2fs/dir/ Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 621b164bbe3c..d92b8e9064cb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -100,6 +100,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + if 
(F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); From bc0c8fe8b11e30ac7d881638ed422a7473827bb7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Aug 2017 18:04:48 +0800 Subject: [PATCH 0416/1212] f2fs: remove unneeded parameter of change_curseg allocate_segment_by_default is the only caller of change_curseg passing @reuse with 'false', but commit 763bfe1bc575 ("f2fs: remove reusing any prefree segments") removes the calling, after that, @reuse in change_curseg always be true, so, let's clean up the unneeded parameter. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 00253111c227..a44c6fd2f1c5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2092,7 +2092,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2113,12 +2113,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) @@ -2182,7 +2180,7 @@ static void 
allocate_segment_by_default(struct f2fs_sb_info *sbi, else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2535,7 +2533,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -2554,7 +2552,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } From 3b8bbd990ce57815f0b0b4b60029ba8d173f5912 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Aug 2017 18:56:05 +0800 Subject: [PATCH 0417/1212] f2fs: avoid race in between atomic_read & atomic_inc Previously, we will miss merging flush command during fsync due to below race condition: Thread A Thread B Thread C - f2fs_issue_flush - atomic_read(&issing_flush) - f2fs_issue_flush - atomic_read(&issing_flush) - f2fs_issue_flush - atomic_read(&issing_flush) - atomic_inc(&issing_flush) - atomic_inc(&issing_flush) - atomic_inc(&issing_flush) - submit_flush_wait - submit_flush_wait - submit_flush_wait It needs to use atomic_inc_return instead to avoid such race. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a44c6fd2f1c5..370b4ca0e294 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -536,8 +536,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return ret; } - if (!atomic_read(&fcc->issing_flush)) { - atomic_inc(&fcc->issing_flush); + if (atomic_inc_return(&fcc->issing_flush) == 1) { ret = submit_flush_wait(sbi); atomic_dec(&fcc->issing_flush); @@ -547,7 +546,6 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) init_completion(&cmd.wait); - atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); /* update issue_list before we wake up issue_flush thread */ From ccb0b5d09d8c46c03e91c8e7a62b57fedee0662e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Aug 2017 18:56:06 +0800 Subject: [PATCH 0418/1212] f2fs: fix to wake up all sleeping flusher In the scenario of remount_ro vs flush, after flush_thread exits in ->remount_fs, the flusher will only clean up the global issue_list, without waking up the flushers waiting on that list, resulting in hangs of the related user threads. In order to fix this issue, this patch enables the flusher to take charge of the issue_flush thread: execute the merged flush command, and wake up all sleeping flushers. 
Fixes: 5eba8c5d1fb3 ("f2fs: fix to access nullified flush_cmd_control pointer") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 370b4ca0e294..9d8d32b38073 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -558,8 +558,27 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) wait_for_completion(&cmd.wait); atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->issing_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; From f24eafa643946942a200e14db46115b7082ce5bf Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 31 Aug 2017 15:06:24 +0530 Subject: [PATCH 0419/1212] f2fs: constify super_operations super_operations are not supposed to change at runtime. "struct super_block" working with super_operations provided by work with const super_operations. 
So mark the non-const structs as const Signed-off-by: Arvind Yadav Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4a5eae7ec64c..731794142009 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1550,7 +1550,7 @@ void f2fs_quota_off_umount(struct super_block *sb) } #endif -static struct super_operations f2fs_sops = { +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, From 9f467e94d08d4b0d674436bc637fdbdfb76490dd Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 4 Sep 2017 11:10:18 +0800 Subject: [PATCH 0420/1212] Revert "f2fs: add a new function get_ssr_cost" This reverts commit b7b7c4cf1c9ef0272a65f1480457cbfdadcda19d. se->ckpt_valid_blocks will never be smaller than se->valid_blocks, so just remove get_ssr_cost. Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 382b7d386ffb..427f53489591 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -277,20 +277,11 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } -static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - struct seg_entry *se = get_seg_entry(sbi, segno); - - return se->ckpt_valid_blocks > se->valid_blocks ? 
- se->ckpt_valid_blocks : se->valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_ssr_cost(sbi, segno); + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) From 4445c7cfbdcc36f07598294b9585f545595e7051 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 5 Sep 2017 16:54:24 -0700 Subject: [PATCH 0421/1212] f2fs: introduce f2fs_encrypted_file for clean-up This patch replaces (f2fs_encrypted_inode() && S_ISREG()) with f2fs_encrypted_file(), which gives no functional change. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 +++++----- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 5 ++--- fs/f2fs/inline.c | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d7aa2e908570..f850060ff2e5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -580,7 +580,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, .encrypted_page = NULL, }; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); @@ -785,7 +785,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -1156,7 +1156,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, struct fscrypt_ctx *ctx = NULL; struct bio *bio; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) return ERR_CAST(ctx); @@ -1343,7 
+1343,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; gfp_t gfp_flags = GFP_NOFS; - if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + if (!f2fs_encrypted_file(inode)) return 0; /* wait for GCed encrypted page writeback */ @@ -1971,7 +1971,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e13daceb7995..7b9246197b23 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3020,6 +3020,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6c2ebe91afeb..2632d447c996 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -109,7 +109,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); out_sem: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 427f53489591..d36130233d9e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -831,8 +831,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { 
add_gc_inode(gc_list, inode); continue; } @@ -873,7 +872,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) move_encrypted_block(inode, start_bidx, segno, off); else move_data_page(inode, start_bidx, gc_type, segno, off); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 041072017ef8..92b5a4a89ed2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -25,7 +25,7 @@ bool f2fs_may_inline_data(struct inode *inode) if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; From e2cd416ffa3262e4cffb03aec48f6ad15f996f95 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 5 Sep 2017 17:04:35 -0700 Subject: [PATCH 0422/1212] f2fs: use generic terms used for encrypted block management This patch renames functions regarding to buffer management via META_MAPPING used for encrypted blocks especially. We can actually use them in generic way. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 13 +++++++++---- fs/f2fs/segment.c | 3 +-- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f850060ff2e5..4d79696c3429 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1162,7 +1162,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, return ERR_CAST(ctx); /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + f2fs_wait_on_block_writeback(sbi, blkaddr); } bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); @@ -1347,7 +1347,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) return 0; /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@ -1972,7 +1972,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_file(inode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7b9246197b23..04ab25448c51 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2631,8 +2631,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void write_node_summaries(struct f2fs_sb_info *sbi, 
block_t start_blk); int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2632d447c996..531379f513fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -110,7 +110,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_file(inode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d36130233d9e..bd16e6631cf3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -599,8 +599,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. 
+ */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -873,9 +877,10 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_file(inode)) - move_encrypted_block(inode, start_bidx, segno, off); + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9d8d32b38073..e95470071030 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2608,8 +2608,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; From fc9c6007a268f2c48643ce32efef6862433580ff Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 6 Sep 2017 21:04:44 -0700 Subject: [PATCH 0423/1212] f2fs: make get_lock_data_page to handle encrypted inode This patch refactors get_lock_data_page() to handle encryption case directly. In order to do that, it introduces common f2fs_submit_page_read(). 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 109 +++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 58 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4d79696c3429..a275cbe57042 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -455,6 +455,53 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); @@ -572,16 +619,6 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct page *page; struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_file(inode)) - return 
read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -622,9 +659,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -1149,35 +1184,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_file(inode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_block_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
@@ -1273,12 +1279,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1986,21 +1991,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - bio->bi_rw = READ_SYNC; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; - goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { From b70c5bf429edad6e1856c20a8d0a309df72af544 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 7 Sep 2017 10:40:54 +0800 Subject: [PATCH 0424/1212] f2fs: avoid race in between read xattr & write xattr Thread A: Thread B: -f2fs_getxattr -lookup_all_xattrs -xnid = F2FS_I(inode)->i_xattr_nid; -f2fs_setxattr -__f2fs_setxattr -write_all_xattrs -truncate_xattr_node ... ... -write_checkpoint ... ... 
-alloc_nid <- nid reuse -get_node_page -f2fs_bug_on <- nid != node_footer->nid It needs a rw_sem to avoid the race. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + fs/f2fs/xattr.c | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04ab25448c51..9d96f6d51eef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -653,6 +653,7 @@ struct f2fs_inode_info { struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 731794142009..315e59ad1483 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -630,6 +630,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); #ifdef CONFIG_QUOTA memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aad59c7c3a63..ab658419552b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -520,8 +520,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; + down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -550,7 +552,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -726,7
+730,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); From ef75b9afda215c1446195bf487987af43a458959 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 9 Sep 2017 12:03:23 -0700 Subject: [PATCH 0425/1212] f2fs: better to wait for fstrim completion In android, we'd better wait for fstrim completion instead of issuing the discard commands asynchronous. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e95470071030..8ee473b1830f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -1141,6 +1142,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) if (dcc->pend_list_tag[i] & P_TRIM) { __submit_discard_cmd(sbi, dc); issued++; + + if (fatal_signal_pending(current)) + break; continue; } @@ -1257,7 +1261,7 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super */ +/* This comes from f2fs_put_super and f2fs_trim_fs */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); @@ -2292,6 +2296,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; From 29f775fa640d4df8d5a1a0dad0b1c44143a00007 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 9 Sep 2017 11:11:04 -0700 Subject: [PATCH 0426/1212] f2fs: speed up gc_urgent mode with SSR This patch activates 
SSR in gc_urgent mode. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 15 +++++++++++++++ fs/f2fs/segment.h | 13 ------------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9d96f6d51eef..ff694127243a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2592,6 +2592,7 @@ void destroy_node_manager_caches(void); /* * segment.c */ +bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8ee473b1830f..3244cfb1885f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -169,6 +169,21 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b8aa84109bf5..ffa11274b0ce 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -497,19 +497,6 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, 
F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); -} - static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { From 95b475cd685686460f04e6862cdc400ff021f843 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 11 Sep 2017 16:30:28 +0900 Subject: [PATCH 0427/1212] f2fs: clear radix tree dirty tag of pages whose dirty flag is cleared In a scenario like writing out the first dirty page of the inode as the inline data, we only cleared dirty flags of the pages, but didn't clear the dirty tags of those pages in the radix tree. If we don't clear the dirty tags of the pages in the radix tree, the inodes which contain the pages will be marked with I_DIRTY_PAGES again and again, and writepages() for the inodes will be invoked in every writeback period. As a result, nothing will be done in every writepages() for the inodes and it will just consume CPU time meaninglessly.
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 7 +++++++ fs/f2fs/inline.c | 7 +++++++ mm/util.c | 1 + 3 files changed, 15 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1380c442648b..4f2a8fedb313 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -705,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -735,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 92b5a4a89ed2..7e76c415b913 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -202,6 +202,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -223,6 +225,11 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/mm/util.c b/mm/util.c index d5259b62f8d7..d7b1065644be 100644 --- 
a/mm/util.c +++ b/mm/util.c @@ -348,6 +348,7 @@ struct address_space *page_mapping(struct page *page) return NULL; return page->mapping; } +EXPORT_SYMBOL(page_mapping); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, From 11dcf7834966656d4337dd6da77b04f6f765a893 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Sep 2017 14:04:05 +0800 Subject: [PATCH 0428/1212] f2fs: detect dirty inode in evict_inode Add a bugon in f2fs_evict_inode to detect inconsistent status between inode cache and related node page cache. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c33b05aec1a1..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -519,6 +519,9 @@ void f2fs_evict_inode(struct inode *inode) stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, From 603dde39653d6dadd170329feb8febe2ac19cde5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Sep 2017 14:25:35 +0800 Subject: [PATCH 0429/1212] f2fs: fix to show correct discard_granularity in sysfs Fix below incorrect display when reading discard_granularity sysfs node. 
$ cat /sys/fs/f2fs//discard_granularity $ 16 $ echo 32 > /sys/fs/f2fs//discard_granularity $ cat /sys/fs/f2fs//discard_granularity $ 16 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 962735dc9c63..e2c258f717cd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -170,6 +170,8 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, dcc->pend_list_tag[i] &= (~P_ACTIVE); } mutex_unlock(&dcc->cmd_lock); + + *ui = t; return count; } From c7fd9e2b4a687666fbf12b73e443134580976606 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Sep 2017 21:35:12 +0800 Subject: [PATCH 0430/1212] f2fs: hurry up to issue discard after io interruption Once we encounter I/O interruption during issuing discards, we will delay a long time before the next round, but if system status is I/O idle during the time, it may lose the opportunity to issue discards. So this patch changes to hurry up to issue discard after io interruption. Besides, this patch also fixes to issue discards accurately with assigned rate.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3244cfb1885f..059a219b7740 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1142,6 +1142,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) struct blk_plug plug; int iter = 0, issued = 0; int i; + bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, @@ -1163,11 +1164,20 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) continue; } - if (!issue_cond || is_idle(sbi)) { - issued++; + if (!issue_cond) { __submit_discard_cmd(sbi, dc); + issued++; + continue; } - if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) goto out; } if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) @@ -1177,6 +1187,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + if (!issued && io_interrupted) + issued = -1; + return issued; } From d5347b1e666dd8ef0d26fde2f4f55e7bbd987dce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 2 Oct 2017 02:50:16 +0800 Subject: [PATCH 0431/1212] f2fs: fix potential panic during fstrim As Ju Hyung Park reported: "When 'fstrim' is called for manual trim, a BUG() can be triggered randomly with this patch. I'm seeing this issue on both x86 Desktop and arm64 Android phone. On x86 Desktop, this was caused during Ubuntu boot-up. I have a cronjob installed which calls 'fstrim -v /' during boot. On arm64 Android, this was caused during GC looping with 1ms gc_min_sleep_time & gc_max_sleep_time." 
Root cause of this issue is that f2fs_wait_discard_bios can only be used by f2fs_put_super, because during put_super there must be no other referrers, so it can ignore discard entry's reference count when removing the entry, otherwise in other caller we will hit bug_on in __remove_discard_cmd as there may be other issuer added reference count in discard entry. Thread A Thread B - issue_discard_thread - f2fs_ioc_fitrim - f2fs_trim_fs - f2fs_wait_discard_bios - __issue_discard_cmd - __submit_discard_cmd - __wait_discard_cmd - dc->ref++ - __wait_one_discard_bio - __wait_discard_cmd - __remove_discard_cmd - f2fs_bug_on(sbi, dc->ref) Fixes: 969d1b180d987c2be02de890d0fff0f66a0e80de Reported-by: Ju Hyung Park Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff694127243a..dd840f60e172 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2606,7 +2606,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 059a219b7740..f5c494389483 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1290,11 +1290,11 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +void f2fs_wait_discard_bios(struct 
f2fs_sb_info *sbi, bool umount) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, !umount); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -2324,7 +2324,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, false); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 315e59ad1483..482bb0333806 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, true); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { From 131bc9f6b7f9efc531eb81f8d542618c6c1cc3c5 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Sat, 23 Sep 2017 17:02:18 +0800 Subject: [PATCH 0432/1212] Revert "f2fs: node segment is prior to data segment selected victim" This reverts commit b9cd20619e359d199b755543474c3d853c8e3415. That patch causes much fewer node segments (which can be used for SSR) than before, and in the corner case (e.g. create and delete *.txt files in one same directory, there will be very few node segments but many data segments), if the reserved free segments are all used up during gc, then the write_checkpoint can still flush dentry pages to data ssr segments, but will probably fail to flush node pages to node ssr segments, since there are not enough node ssr segments left (the left ones are all full). So revert this patch to give a fair chance to let node segments remain for SSR, which provides more robustness for corner cases. 
Conflicts: fs/f2fs/gc.c Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bd16e6631cf3..3a6eaf01cdf7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -267,16 +267,6 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - unsigned int valid_blocks = - get_valid_blocks(sbi, segno, true); - - return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? - valid_blocks * 2 : valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -285,7 +275,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_greedy_cost(sbi, segno); + return get_valid_blocks(sbi, segno, true); else return get_cb_cost(sbi, segno); } From dbce11e9ee5b89fd88e0fef40478c2bb8ff1ef68 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Sep 2017 14:17:51 +0800 Subject: [PATCH 0433/1212] Revert "f2fs: reuse nids more aggressively" Commit 268344664603 ("f2fs: reuse nids more aggressively") tries to reuse as many nids as possible, in order to mitigate producing obsolete node pages in page cache. But actually, before we reuse the nids and related node page cache, we will always invalidate that node page, so there will not be any obsolete node pages in cache. Let's just revert the previous commit, so that nm_i::next_scan_nid can be increased ascendingly, making __build_free_nids traverse all NAT pages more easily; finally, free nid bitmap cache can be enabled as soon as possible.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 32474db18ad9..264ccd157858 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -327,10 +327,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); - - /* in order to reuse the nid */ - if (nm_i->next_scan_nid > ni->nid) - nm_i->next_scan_nid = ni->nid; } /* change address */ From 8ea6e1c327c53c785d9a29303e963d3b5c9f9ff4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Sep 2017 18:58:02 +0800 Subject: [PATCH 0434/1212] f2fs: introduce read_inline_xattr Commit ba38c27eb93e ("f2fs: enhance lookup xattr") introduces lookup_all_xattrs by duplicating read_all_xattrs, which leaves a lot of similar code between them, so introduce a new helper, read_inline_xattr, to clean up the redundant code.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 59 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ab658419552b..bbdf9955c2dc 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -288,6 +288,29 @@ static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, return entry; } +static int read_inline_xattr(struct inode *inode, struct page *ipage, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + return 0; +} + static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, @@ -310,21 +333,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto out; *xe = __find_inline_xattr(txattr_addr, &last_addr, index, len, name); @@ -386,21 +397,9 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - 
page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto fail; } /* read from xattr node block */ From 6d625a93b4a8acf6eaa2cfebd21ce8bd7c7080dc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Sep 2017 18:58:03 +0800 Subject: [PATCH 0435/1212] f2fs: introduce read_xattr_block Commit ba38c27eb93e ("f2fs: enhance lookup xattr") introduces lookup_all_xattrs by duplicating read_all_xattrs, which leaves a lot of similar code between them, so introduce a new helper, read_xattr_block, to clean up the redundant code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index bbdf9955c2dc..c5e6a7e42262 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -311,12 +311,31 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, return 0; } +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block.
*/ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) + return PTR_ERR(xpage); + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_put_page(xpage, 1); + + return 0; +} + static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; @@ -345,19 +364,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto out; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } if (last_addr) @@ -382,7 +391,6 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = VALID_XATTR_BLOCK_SIZE; @@ -404,19 +412,9 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. 
*/ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto fail; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } header = XATTR_HDR(txattr_addr); From 322a45d172124837d6a253828465ba2ccb652443 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 14 Sep 2017 10:18:01 +0800 Subject: [PATCH 0436/1212] f2fs: show flush list status in sysfs This patch adds to show flush list status in sysfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 ++++- fs/f2fs/f2fs.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 87f449845f5f..00c1d4a9f356 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -61,6 +61,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); } if (SM_I(sbi) && SM_I(sbi)->dcc_info) { si->nr_discarded = @@ -349,10 +351,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, + si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. 
%4d), " diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dd840f60e172..7ed6e4abdd15 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2755,7 +2755,8 @@ struct f2fs_stat_info { int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; From 4de0ceb6b7ef46851b82251e2470cd81920d48cc Mon Sep 17 00:00:00 2001 From: Hsiang Kao Date: Sun, 24 Sep 2017 02:45:42 +0800 Subject: [PATCH 0437/1212] f2fs: allow readpages with NULL file pointer Keep in line with the other Linux file system implementations since page_cache_sync_readahead supports NULL file pointer, and thus we can readahead data by f2fs itself without file opening (something like the btrfs behavior). Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a275cbe57042..3b1dea525f15 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1331,7 +1331,7 @@ static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); From 75d3164ae128764bfef899df03a1facd31ab2f21 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Fri, 29 Sep 2017 22:43:23 +0800 Subject: [PATCH 0438/1212] f2fs: convert inline data for direct I/O & FI_NO_PREALLOC In FI_NO_PREALLOC cases, direct I/O path may allocate blocks for an inode but keep its inline data flag. This inconsistency may trigger vfs clear_inode nrpages bug_on when evicting the inode. 
We should convert inline data first in this case. Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3b1dea525f15..143355c91873 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -831,6 +831,13 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + /* convert inline data for Direct I/O*/ + if (iocb->ki_flags & IOCB_DIRECT) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; @@ -843,15 +850,11 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; - if (iocb->ki_flags & IOCB_DIRECT) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; + if (iocb->ki_flags & IOCB_DIRECT) return f2fs_map_blocks(inode, &map, 1, __force_buffered_io(inode, WRITE) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); - } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) From f555b0a117d38ea29b157b788437ff6f0c72bd37 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:35 +0800 Subject: [PATCH 0439/1212] f2fs: obsolete ALLOC_NID_LIST list As Fan Li reported, there is no user traversing nid_list[ALLOC_NID_LIST] which is used for tracking preallocated nids. Let's drop it, and only track preallocated nids in free_nid_root radix-tree. 
Reported-by: Fan Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++-- fs/f2fs/f2fs.h | 15 ++++--- fs/f2fs/node.c | 97 ++++++++++++++++++++++------------------------ fs/f2fs/node.h | 15 ++----- fs/f2fs/shrinker.c | 2 +- 5 files changed, 64 insertions(+), 73 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 00c1d4a9f356..14095fbb4039 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -98,9 +98,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; - si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -233,8 +233,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) } /* free nids */ - si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + - NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7ed6e4abdd15..685145846946 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -730,10 +730,13 @@ static inline void __try_update_largest_extent(struct inode *inode, } } -enum nid_list { - FREE_NID_LIST, - ALLOC_NID_LIST, - MAX_NID_LIST, +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, }; struct f2fs_nm_info { @@ -756,8 +759,8 @@ struct f2fs_nm_info { /* free node ids management */ struct 
radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ - unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 264ccd157858..513f5dfb1952 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,7 +46,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + mem_size = (nm_i->nid_cnt[FREE_NID] * sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { @@ -1760,8 +1760,8 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static int __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool new) +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1771,22 +1771,22 @@ static int __insert_nid_to_list(struct f2fs_sb_info *sbi, return err; } - f2fs_bug_on(sbi, list == FREE_NID_LIST ? 
i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]++; - list_add_tail(&i->list, &nm_i->nid_list[list]); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]++; + if (state == FREE_NID) + list_add_tail(&i->list, &nm_i->free_nid_list); return 0; } -static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool reuse) +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]--; - list_del(&i->list); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); if (!reuse) radix_tree_delete(&nm_i->free_nid_root, i->nid); } @@ -1806,7 +1806,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; - i->state = NID_NEW; + i->state = FREE_NID; if (radix_tree_preload(GFP_NOFS)) goto err; @@ -1819,7 +1819,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - f2fs_create * - f2fs_new_inode * - alloc_nid - * - __insert_nid_to_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg * - build_free_nids * - __build_free_nids @@ -1832,8 +1832,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - new_node_page * - set_node_addr * - alloc_nid_done - * - __remove_nid_from_list(ALLOC_NID_LIST) - * - __insert_nid_to_list(FREE_NID_LIST) + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || @@ -1842,13 +1842,13 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) e = __lookup_free_nid_list(nm_i, nid); if (e) { - if (e->state == NID_NEW) + if (e->state == 
FREE_NID) ret = true; goto err_out; } } ret = true; - err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + err = __insert_free_nid(sbi, i, FREE_NID, true); err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); @@ -1866,8 +1866,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1952,7 +1952,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; } } @@ -1985,7 +1985,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nid = 0; /* Enough entries */ - if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; if (!sync && !available_free_memory(sbi, FREE_NIDS)) @@ -1995,7 +1995,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); - if (nm_i->nid_cnt[FREE_NID_LIST]) + if (nm_i->nid_cnt[FREE_NID]) return; } @@ -2072,15 +2072,15 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); - i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, 
FREE_NID_LIST, true); - i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, FREE_NID, true); + i->state = PREALLOC_NID; + __insert_free_nid(sbi, i, PREALLOC_NID, false); nm_i->available_nids--; update_free_nid_bitmap(sbi, *nid, false, false); @@ -2106,7 +2106,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -2129,12 +2129,12 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, !i); if (!available_free_memory(sbi, FREE_NIDS)) { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID, false); need_free = true; } else { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); - i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID, true); + i->state = FREE_NID; + __insert_free_nid(sbi, i, FREE_NID, false); } nm_i->available_nids++; @@ -2153,20 +2153,19 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], - list) { + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { if (nr_shrink <= 0 || - nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + __remove_free_nid(sbi, i, FREE_NID, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2638,16 +2637,15 @@ static int 
init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - F2FS_RESERVED_NODE_NUM; - nm_i->nid_cnt[FREE_NID_LIST] = 0; - nm_i->nid_cnt[ALLOC_NID_LIST] = 0; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); - INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); + INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -2739,16 +2737,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], - list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + __remove_free_nid(sbi, i, FREE_NID, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); - f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); - f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index bb53e9955ff2..e91b08b4a51a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -150,18 +150,10 @@ struct nat_entry_set { unsigned int entry_cnt; /* the # of nat entries in set */ }; -/* - * For free nid mangement - */ -enum nid_state { - NID_NEW, /* newly added to 
free nid list */ - NID_ALLOC /* it is allocated */ -}; - struct free_nid { struct list_head list; /* for free node id list */ nid_t nid; /* node id */ - int state; /* in use or not: NID_NEW or NID_ALLOC */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ }; static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) @@ -170,12 +162,11 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *fnid; spin_lock(&nm_i->nid_list_lock); - if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + if (nm_i->nid_cnt[FREE_NID] <= 0) { spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], - struct free_nid, list); + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); } diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 5c60fc28ec75..0b5664a1a6cc 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,7 +28,7 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; return count > 0 ? count : 0; } From 814b463d262f19f5997c3632256ea41a4ee0be11 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:36 +0800 Subject: [PATCH 0440/1212] f2fs: drop FI_UPDATE_WRITE tag after f2fs_issue_flush If we failed to issue flush in ->fsync, we need to keep FI_UPDATE_WRITE flag to make sure triggering flush in next ->fsync. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 531379f513fa..43617d7c596c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -297,10 +297,12 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_ino_entry(sbi, ino, UPDATE_INO); - clear_inode_flag(inode, FI_UPDATE_WRITE); if (!atomic) ret = f2fs_issue_flush(sbi); + if (!ret) { + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + } f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); From 9c2526ac2ecbb716523bfd21bf1c3e55e1e28c9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:37 +0800 Subject: [PATCH 0441/1212] f2fs: fix to show ino management cache size correctly The size of the ino management cache needs to be accounted across all ino entry types instead of only the orphan ino type.
Fixes: 652be55162dc ("f2fs: show # of orphan inodes") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 14095fbb4039..d441660c3ba6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -240,7 +240,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= ORPHAN_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree); From 08bb9d68d51b2946f244f77865a48b23b29af1eb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:38 +0800 Subject: [PATCH 0442/1212] f2fs: enhance multiple device flush When the multiple device feature is enabled, during ->fsync we issue a flush to all devices to make sure the node/data of the file is persisted to storage. But some of those device flushes may be unneeded, as the file's data may not have been written back to those devices. So this patch adds and manages a per-inode bitmap in the global ino cache to indicate which devices are dirty and need a flush during ->fsync; hence, we can improve fsync performance in the multiple-device scenario.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 36 +++++++++++++++++++++++++++++----- fs/f2fs/data.c | 1 + fs/f2fs/f2fs.h | 14 +++++++++++--- fs/f2fs/file.c | 3 ++- fs/f2fs/gc.c | 2 ++ fs/f2fs/inline.c | 1 + fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 3 ++- fs/f2fs/segment.c | 46 +++++++++++++++++++++++++++++++++----------- 9 files changed, 86 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e86f67ac96c6..b1c6e75c2764 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -402,7 +402,8 @@ const struct address_space_operations f2fs_meta_aops = { #endif }; -static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e, *tmp; @@ -427,6 +428,10 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) if (type != ORPHAN_INO) im->ino_num++; } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + spin_unlock(&im->ino_lock); radix_tree_preload_end(); @@ -455,7 +460,7 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ - __add_ino_entry(sbi, ino, type); + __add_ino_entry(sbi, ino, 0, type); } void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -481,7 +486,7 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) struct ino_entry *e, *tmp; int i; - for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? 
ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -495,6 +500,27 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -531,7 +557,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); update_inode_page(inode); } @@ -555,7 +581,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return err; } - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(sbi, ino, 0, ORPHAN_INO); inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 143355c91873..a655a39d60b3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1495,6 +1495,7 @@ static int __write_data_page(struct page *page, bool *submitted, int err = 0; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 685145846946..06a4d784abce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -244,12 +244,14 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino 
list */ UPDATE_INO, /* for update ino list */ + FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; struct ino_entry { - struct list_head list; /* list head */ - nid_t ino; /* inode number */ + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ }; /* for the list of inodes to be GCed */ @@ -838,6 +840,7 @@ enum { struct flush_cmd { struct completion wait; struct llist_node llnode; + nid_t ino; int ret; }; @@ -965,6 +968,7 @@ enum iostat_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ @@ -2602,7 +2606,7 @@ void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); -int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int create_flush_cmd_control(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); @@ -2664,6 +2668,10 @@ void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); int acquire_orphan_inode(struct f2fs_sb_info *sbi); void release_orphan_inode(struct f2fs_sb_info *sbi); diff --git 
a/fs/f2fs/file.c b/fs/f2fs/file.c index 43617d7c596c..cd569d394272 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -298,10 +298,11 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: if (!atomic) - ret = f2fs_issue_flush(sbi); + ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, FLUSH_INO); } f2fs_update_time(sbi, REQ_TIME); out: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3a6eaf01cdf7..32b0b0632e15 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -598,6 +598,7 @@ static void move_data_block(struct inode *inode, block_t bidx, { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_READ, @@ -728,6 +729,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_WRITE, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 7e76c415b913..0fa5ca0907ba 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -112,6 +112,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 50c88e37ed66..ad4f7d52c0ad 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -480,6 +480,7 @@ void f2fs_evict_inode(struct inode *inode) remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 513f5dfb1952..733a8e14a4c8 100644 --- a/fs/f2fs/node.c +++ 
b/fs/f2fs/node.c @@ -63,7 +63,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) } else if (type == INO_ENTRIES) { int i; - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) mem_size += sbi->im[i].ino_num * sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; @@ -1340,6 +1340,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, + .ino = ino_of_node(page), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f5c494389483..5351caa2ffd9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -313,6 +313,7 @@ static int __commit_inmem_pages(struct inode *inode, struct inmem_pages *cur, *tmp; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, @@ -485,15 +486,17 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, return ret; } -static int submit_flush_wait(struct f2fs_sb_info *sbi) +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { - int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); + int ret = 0; int i; - if (!sbi->s_ndevs || ret) - return ret; + if (!sbi->s_ndevs) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); - for (i = 1; i < sbi->s_ndevs; i++) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (!is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; @@ -519,7 +522,9 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - ret = submit_flush_wait(sbi); + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, @@ -537,7 +542,7 @@ static int issue_flush_thread(void *data) goto 
repeat; } -int f2fs_issue_flush(struct f2fs_sb_info *sbi) +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; @@ -547,19 +552,20 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); atomic_inc(&fcc->issued_flush); return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1) { - ret = submit_flush_wait(sbi); + if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->issing_flush); atomic_inc(&fcc->issued_flush); return ret; } + cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); @@ -583,7 +589,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) } else { struct flush_cmd *tmp, *next; - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { @@ -2464,6 +2470,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); } +static void update_device_state(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int devidx; + + if (!sbi->s_ndevs) + return; + + devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + + /* update device state for fsync */ + set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); +} + static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); @@ -2478,6 +2498,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; + } else if (!err) { + update_device_state(fio); } } @@ -2538,6 +2560,8 @@ int rewrite_data_page(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); + if (!err) + 
update_device_state(fio); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); From 27eaad09380fe2f1fd8dbfb1e3e7ae6afd70ca80 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:39 +0800 Subject: [PATCH 0443/1212] f2fs: fix to flush multiple device in checkpoint If f2fs manages multiple devices, then during checkpoint we need to issue a flush to every device that holds dirty data/node blocks in its cache before we write the checkpoint region; otherwise, filesystem metadata could be corrupted if a sudden power-off (SPO) hits after the checkpoint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 29 +++++++++++++++++++++++++++++ fs/f2fs/super.c | 3 +++ 4 files changed, 41 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b1c6e75c2764..90ff066c9569 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1173,6 +1173,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; + int err; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1266,6 +1267,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 06a4d784abce..fdf216423473 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1186,6 +1186,8 @@ struct f2fs_sb_info { struct list_head s_list; int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2608,6
+2610,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5351caa2ffd9..c009bdff2ff6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -659,6 +659,28 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } } +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + if (!sbi->s_ndevs) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -2482,6 +2504,13 @@ static void update_device_state(struct f2fs_io_info *fio) /* update device state for fsync */ set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 482bb0333806..5fe6047d1db8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1969,6 +1969,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (j = HOT; j < NR_TEMP_TYPE; j++) 
mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) From 684447dad1385fef8a1c2bfaff770860b0beddc2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:32 +0800 Subject: [PATCH 0444/1212] f2fs: support issuing/waiting discard in range Fstrim intends to trim invalid blocks of the filesystem only within the specified range and granularity, but in fact it issues all previously cached discard commands, which may be out of range or of unmatched granularity; that is unneeded. To fix the above issues, this patch introduces new helpers to support issuing and waiting for discards in range, and adds a new fstrim_list for tracking in-flight discards from ->fstrim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 127 ++++++++++++++++++++++++++++++++++++++-------- fs/f2fs/super.c | 2 +- 3 files changed, 109 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fdf216423473..ea2cd4112a40 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -316,6 +316,7 @@ struct discard_cmd_control { struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; @@ -2616,7 +2617,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); +void
f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c009bdff2ff6..8bdc31d1c847 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -954,9 +954,11 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc) + struct discard_cmd *dc, bool fstrim) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = fstrim ? &(dcc->fstrim_list) : + &(dcc->wait_list); struct bio *bio = NULL; if (dc->state != D_PREP) @@ -977,7 +979,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); - list_move_tail(&dc->list, &dcc->wait_list); + list_move_tail(&dc->list, wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); f2fs_update_iostat(sbi, FS_DISCARD, 1); @@ -1162,6 +1164,68 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + unsigned int start, unsigned int end, + unsigned int granularity) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; 
+ + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dc, true); + + if (++issued >= DISCARD_ISSUE_RATE) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + schedule(); + + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1184,22 +1248,19 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) /* Hurry up to finish fstrim */ if (dcc->pend_list_tag[i] & P_TRIM) { - __submit_discard_cmd(sbi, dc); + __submit_discard_cmd(sbi, dc, false); issued++; - - if (fatal_signal_pending(current)) - break; continue; } if (!issue_cond) { - __submit_discard_cmd(sbi, dc); + __submit_discard_cmd(sbi, dc, false); issued++; continue; } if (is_idle(sbi)) { - __submit_discard_cmd(sbi, dc); + __submit_discard_cmd(sbi, dc, false); issued++; } else { io_interrupted = true; @@ -1253,10 +1314,14 @@ static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, mutex_unlock(&dcc->cmd_lock); } -static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, bool wait_cond, + block_t start, block_t end, + unsigned int granularity, + bool fstrim) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->wait_list); + struct list_head *wait_list = fstrim ? 
&(dcc->fstrim_list) : + &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; @@ -1265,6 +1330,10 @@ static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (dc->lstart + dc->len <= start || end <= dc->lstart) + continue; + if (dc->len < granularity) + continue; if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); @@ -1282,6 +1351,11 @@ static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) } } +static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + __wait_discard_cmd_range(sbi, wait_cond, 0, UINT_MAX, 1, false); +} + /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -1317,12 +1391,12 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) +/* This comes from f2fs_put_super */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, !umount); + __wait_all_discard_cmd(sbi, false); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -1366,7 +1440,7 @@ static int issue_discard_thread(void *data) issued = __issue_discard_cmd(sbi, true); if (issued) { - __wait_discard_cmd(sbi, true); + __wait_all_discard_cmd(sbi, true); wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; } else { wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; @@ -1677,6 +1751,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->pend_list_tag[i] |= P_ACTIVE; } INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); @@ -2304,7 +2379,8 @@ int f2fs_trim_fs(struct 
f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno; + unsigned int start_segno, end_segno, cur_segno; + block_t start_block, end_block; struct cp_control cpc; int err = 0; @@ -2325,12 +2401,17 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); /* do checkpoint to issue discard commands safely */ - for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { - cpc.trim_start = start_segno; + for (cur_segno = start_segno; cur_segno <= end_segno; + cur_segno = cpc.trim_end + 1) { + cpc.trim_start = cur_segno; if (sbi->discard_blks == 0) break; @@ -2338,7 +2419,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.trim_end = end_segno; else cpc.trim_end = min_t(unsigned int, - rounddown(start_segno + + rounddown(cur_segno + BATCHED_TRIM_SEGMENTS(sbi), sbi->segs_per_sec) - 1, end_segno); @@ -2350,9 +2431,13 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } - /* It's time to issue all the filed discards */ - mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi, false); + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + + __issue_discard_cmd_range(sbi, start_block, end_block, cpc.trim_minlen); + __wait_discard_cmd_range(sbi, true, start_block, end_block, + cpc.trim_minlen, true); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5fe6047d1db8..07734666eae1 100644 --- 
a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi, true); + f2fs_wait_discard_bios(sbi); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { From 1e65afd14d32eb318caaebf16f5797e2c723fa20 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:33 +0800 Subject: [PATCH 0445/1212] f2fs: wrap discard policy This patch wraps scattered optional parameters into discard policy as below, later, with it we expect that we can adjust these parameters with proper strategy in different scenario. struct discard_policy { unsigned int min_interval; /* used for candidates exist */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ }; This patch doesn't change any logic of codes. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 +++++++++++- fs/f2fs/segment.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ea2cd4112a40..d4dd9efd48ec 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -214,7 +214,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) -#define DISCARD_ISSUE_RATE 8 +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -310,6 +310,15 @@ struct discard_cmd { int error; /* bio error */ }; +struct discard_policy { + unsigned int min_interval; /* used for candidates exist */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int max_requests; /* # of discards issued per round */ + unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ +}; + struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ @@ -328,6 +337,7 @@ struct discard_cmd_control { atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ + struct discard_policy dpolicy; /* current discard policy */ }; /* for the list of fsync inodes, used only during recovery */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8bdc31d1c847..c1d648a7d214 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -960,6 +960,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct list_head *wait_list = fstrim ? 
&(dcc->fstrim_list) : &(dcc->wait_list); struct bio *bio = NULL; + int flag = dcc->dpolicy.sync ? REQ_SYNC : 0; if (dc->state != D_PREP) return; @@ -978,7 +979,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); + submit_bio(flag, bio); list_move_tail(&dc->list, wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); @@ -1172,6 +1173,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; + struct discard_policy *dpolicy = &dcc->dpolicy; struct blk_plug plug; int issued; @@ -1204,7 +1206,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, __submit_discard_cmd(sbi, dc, true); - if (++issued >= DISCARD_ISSUE_RATE) { + if (++issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; blk_finish_plug(&plug); @@ -1232,6 +1234,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; + struct discard_policy *dpolicy = &dcc->dpolicy; int iter = 0, issued = 0; int i; bool io_interrupted = false; @@ -1259,14 +1262,16 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) continue; } - if (is_idle(sbi)) { - __submit_discard_cmd(sbi, dc, false); - issued++; - } else { + if (dpolicy->io_aware && i < dpolicy->io_aware_gran && + !is_idle(sbi)) { io_interrupted = true; + goto skip; } - if (++iter >= DISCARD_ISSUE_RATE) + __submit_discard_cmd(sbi, dc, false); + issued++; +skip: + if (++iter >= dpolicy->max_requests) goto out; } if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) @@ -1415,6 +1420,7 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = 
&dcc->discard_wait_queue; + struct discard_policy *dpolicy = &dcc->dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; @@ -1441,9 +1447,9 @@ static int issue_discard_thread(void *data) issued = __issue_discard_cmd(sbi, true); if (issued) { __wait_all_discard_cmd(sbi, true); - wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + wait_ms = dpolicy->min_interval; } else { - wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + wait_ms = dpolicy->max_interval; } sb_end_intwrite(sbi->sb); @@ -1728,6 +1734,18 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) wake_up_discard_thread(sbi, false); } +static void inline init_discard_policy(struct discard_cmd_control *dcc) +{ + struct discard_policy *dpolicy = &dcc->dpolicy; + + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + dpolicy->sync = true; +} + static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -1761,6 +1779,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->undiscard_blks = 0; dcc->root = RB_ROOT; + init_discard_policy(dcc); + init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: From a34ab5ca4f94543741fa304c4cb2095f0bc82898 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:34 +0800 Subject: [PATCH 0446/1212] f2fs: split discard policy There are many different scenarios such as fstrim, umount, urgent or background where we will issue discards, actually, they need use different policy in aspect of io aware, discard granularity, delay interval and so on. But now they just share one common discard policy, so there will be race when changing policy in between these scenarios, the interference of changing discard policy will be very serious. 
This patch changes to split discard policy for different scenarios. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 17 ++++-- fs/f2fs/segment.c | 149 +++++++++++++++++++++++----------------------- fs/f2fs/segment.h | 5 +- fs/f2fs/sysfs.c | 13 ---- 4 files changed, 88 insertions(+), 96 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4dd9efd48ec..ec7a55218967 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -275,10 +275,6 @@ struct discard_entry { #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) -#define P_ACTIVE 0x01 -#define P_TRIM 0x02 -#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) - enum { D_PREP, D_SUBMIT, @@ -310,13 +306,23 @@ struct discard_cmd { int error; /* bio error */ }; +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + struct discard_policy { + int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ + unsigned int granularity; /* discard granularity */ }; struct discard_cmd_control { @@ -337,7 +343,6 @@ struct discard_cmd_control { atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ - struct discard_policy dpolicy; /* current discard policy */ }; /* for the list of fsync inodes, used only during recovery */ @@ -2625,6 +2630,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info 
*sbi, block_t blkaddr); +void init_discard_policy(struct discard_policy *dpolicy, int discard_type, + unsigned int granularity); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c1d648a7d214..f1dbf8d5574e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -954,13 +954,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc, bool fstrim) + struct discard_policy *dpolicy, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = fstrim ? &(dcc->fstrim_list) : - &(dcc->wait_list); + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); struct bio *bio = NULL; - int flag = dcc->dpolicy.sync ? REQ_SYNC : 0; + int flag = dpolicy->sync ? 
REQ_SYNC : 0; if (dc->state != D_PREP) return; @@ -1166,14 +1167,13 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, } static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, - unsigned int start, unsigned int end, - unsigned int granularity) + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; - struct discard_policy *dpolicy = &dcc->dpolicy; struct blk_plug plug; int issued; @@ -1196,7 +1196,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, while (dc && dc->lstart <= end) { struct rb_node *node; - if (dc->len < granularity) + if (dc->len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { @@ -1204,7 +1204,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, goto skip; } - __submit_discard_cmd(sbi, dc, true); + __submit_discard_cmd(sbi, dpolicy, dc); if (++issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; @@ -1228,54 +1228,39 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, mutex_unlock(&dcc->cmd_lock); } -static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - struct discard_policy *dpolicy = &dcc->dpolicy; - int iter = 0, issued = 0; - int i; + int i, iter = 0, issued = 0; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dpolicy->granularity) + break; pend_list = 
&dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - /* Hurry up to finish fstrim */ - if (dcc->pend_list_tag[i] & P_TRIM) { - __submit_discard_cmd(sbi, dc, false); - issued++; - continue; - } - - if (!issue_cond) { - __submit_discard_cmd(sbi, dc, false); - issued++; - continue; - } - if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi)) { io_interrupted = true; goto skip; } - __submit_discard_cmd(sbi, dc, false); + __submit_discard_cmd(sbi, dpolicy, dc); issued++; skip: if (++iter >= dpolicy->max_requests) goto out; } - if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) - dcc->pend_list_tag[i] &= (~P_TRIM); } out: blk_finish_plug(&plug); @@ -1319,14 +1304,13 @@ static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, mutex_unlock(&dcc->cmd_lock); } -static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, bool wait_cond, - block_t start, block_t end, - unsigned int granularity, - bool fstrim) +static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = fstrim ? &(dcc->fstrim_list) : - &(dcc->wait_list); + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? 
+ &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; @@ -1337,9 +1321,9 @@ static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, bool wait_cond, list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->lstart + dc->len <= start || end <= dc->lstart) continue; - if (dc->len < granularity) + if (dc->len < dpolicy->granularity) continue; - if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + if (dc->state == D_DONE && !dc->ref) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } else { @@ -1356,9 +1340,10 @@ static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, bool wait_cond, } } -static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) { - __wait_discard_cmd_range(sbi, wait_cond, 0, UINT_MAX, 1, false); + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1398,21 +1383,14 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) /* This comes from f2fs_put_super */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) -{ - __issue_discard_cmd(sbi, false); - __drop_discard_cmd(sbi); - __wait_all_discard_cmd(sbi, false); -} - -static void mark_discard_range_all(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; + struct discard_policy dpolicy; - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) - dcc->pend_list_tag[i] |= P_TRIM; - mutex_unlock(&dcc->cmd_lock); + init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + __drop_discard_cmd(sbi); + __wait_all_discard_cmd(sbi, &dpolicy); } static int issue_discard_thread(void *data) @@ -1420,13 +1398,16 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; 
wait_queue_head_t *q = &dcc->discard_wait_queue; - struct discard_policy *dpolicy = &dcc->dpolicy; + struct discard_policy dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; set_freezable(); do { + init_discard_policy(&dpolicy, DPOLICY_BG, + dcc->discard_granularity); + wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || dcc->discard_wake, @@ -1439,17 +1420,18 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) { dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - mark_discard_range_all(sbi); + init_discard_policy(&dpolicy, + DPOLICY_FORCE, 1); } sb_start_intwrite(sbi->sb); - issued = __issue_discard_cmd(sbi, true); + issued = __issue_discard_cmd(sbi, &dpolicy); if (issued) { - __wait_all_discard_cmd(sbi, true); - wait_ms = dpolicy->min_interval; + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; } else { - wait_ms = dpolicy->max_interval; + wait_ms = dpolicy.max_interval; } sb_end_intwrite(sbi->sb); @@ -1734,16 +1716,35 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) wake_up_discard_thread(sbi, false); } -static void inline init_discard_policy(struct discard_cmd_control *dcc) +void init_discard_policy(struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) { - struct discard_policy *dpolicy = &dcc->dpolicy; - - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; - dpolicy->io_aware = true; + /* common policy */ + dpolicy->type = discard_type; dpolicy->sync = true; + dpolicy->granularity = granularity; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + 
dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) @@ -1763,11 +1764,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) { + for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); - if (i >= dcc->discard_granularity - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - } INIT_LIST_HEAD(&dcc->wait_list); INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); @@ -1779,8 +1777,6 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->undiscard_blks = 0; dcc->root = RB_ROOT; - init_discard_policy(dcc); - init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: @@ -2402,6 +2398,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) unsigned int start_segno, end_segno, cur_segno; block_t start_block, end_block; struct cp_control cpc; + struct discard_policy dpolicy; int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) @@ -2455,9 +2452,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); - __issue_discard_cmd_range(sbi, start_block, end_block, 
cpc.trim_minlen); - __wait_discard_cmd_range(sbi, true, start_block, end_block, - cpc.trim_minlen, true); + init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ffa11274b0ce..d12d9cd99f91 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -795,8 +795,9 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) goto wake_up; mutex_lock(&dcc->cmd_lock); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e2c258f717cd..89f25efffd43 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -154,23 +154,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, } if (!strcmp(a->attr.name, "discard_granularity")) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; - if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; if (t == *ui) return count; - - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) { - if (i >= t - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - else - dcc->pend_list_tag[i] &= (~P_ACTIVE); - } - mutex_unlock(&dcc->cmd_lock); - *ui = t; return count; } From bd502c6e3e7a59aaf28b6d065384bd90f40790bf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:35 +0800 Subject: [PATCH 0447/1212] f2fs: reduce cmd_lock coverage in __issue_discard_cmd __submit_discard_cmd may lead to long latency due to exhaustion of I/O request resources in the block layer, so issuing all discards under cmd_lock may lead to a hung task. In order to avoid that, let's reduce its coverage.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1dbf8d5574e..859ead471243 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1238,14 +1238,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, iter = 0, issued = 0; bool io_interrupted = false; - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); - blk_start_plug(&plug); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dpolicy->granularity) break; pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1259,12 +1259,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, issued++; skip: if (++iter >= dpolicy->max_requests) - goto out; + break; } + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (iter >= dpolicy->max_requests) + break; } -out: - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); if (!issued && io_interrupted) issued = -1; From df74eacb207596ba0f4323bbb6b2bbc974c6f87b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:36 +0800 Subject: [PATCH 0448/1212] f2fs: trace f2fs_remove_discard This patch adds tracepoint to trace f2fs_remove_discard. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ include/trace/events/f2fs.h | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 859ead471243..41b3fc0cca62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -822,6 +822,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7063bbcca03b..1c725ff5786b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1274,6 +1274,13 @@ DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, TP_ARGS(dev, blkstart, blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_remove_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), From 68e801abc520b06ba24af6fde667408e4372fe3a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:37 +0800 Subject: [PATCH 0449/1212] f2fs: give up CP_TRIMMED_FLAG if it drops discards In ->umount, once we drop the remaining discard entries, we should not set CP_TRIMMED_FLAG with another checkpoint.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 13 ++++++++++--- fs/f2fs/super.c | 5 +++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec7a55218967..9267f50dfe8f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2634,7 +2634,7 @@ void init_discard_policy(struct discard_policy *dpolicy, int discard_type, unsigned int granularity); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 41b3fc0cca62..a065a2c01b5f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1276,12 +1276,13 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, return issued; } -static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; + bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { @@ -1289,9 +1290,12 @@ static void __drop_discard_cmd(struct f2fs_sb_info *sbi) list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); + dropped = true; } } mutex_unlock(&dcc->cmd_lock); + + return dropped; } static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, @@ -1386,15 +1390,18 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_wait_discard_bios(struct 
f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; + bool dropped; init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); - __drop_discard_cmd(sbi); + dropped = __drop_discard_cmd(sbi); __wait_all_discard_cmd(sbi, &dpolicy); + + return dropped; } static int issue_discard_thread(void *data) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07734666eae1..8e7ef3712bcc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -781,6 +781,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + bool dropped; f2fs_quota_off_umount(sb); @@ -801,9 +802,9 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; From 0c47a892d555f0bcf6613c6b85a6e95d92c55973 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 7 Oct 2017 16:02:21 +0200 Subject: [PATCH 0450/1212] f2fs: Fix bool initialization/comparison Bool initializations should use true and false. Bool tests don't need comparisons. Signed-off-by: Thomas Meyer Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a655a39d60b3..277dafd7c964 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -416,8 +416,8 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - /* set submitted = 1 as a return value */ - fio->submitted = 1; + /* set submitted = true as a return value */ + fio->submitted = true; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); From 85853e7e38d7691ae9d41ef7ed4579313b857584 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Oct 2017 17:55:19 +0800 Subject: [PATCH 0451/1212] f2fs: fix to avoid race when accessing last_disk_size last_disk_size could be wrong due to concurrently updating, so using i_sem semaphore to make last_disk_size updating exclusive to fix this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 10 ++++++++-- fs/f2fs/file.c | 4 ++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 277dafd7c964..43e32216e681 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1567,8 +1567,11 @@ static int __write_data_page(struct page *page, bool *submitted, err = do_write_data_page(&fio); } } + + down_write(&F2FS_I(inode)->i_sem); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); done: if (err && err != -ENOENT) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9267f50dfe8f..bd5839895737 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2345,9 +2345,10 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + bool ret; + if (dsync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; spin_lock(&sbi->inode_lock[DIRTY_META]); ret = list_empty(&F2FS_I(inode)->gdirty_list); @@ -2358,7 +2359,12 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) file_keep_isize(inode) || i_size_read(inode) & PAGE_MASK) return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); + + down_read(&F2FS_I(inode)->i_sem); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + 
up_read(&F2FS_I(inode)->i_sem); + + return ret; } static inline int f2fs_readonly(struct super_block *sb) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cd569d394272..bf6845c01d38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -759,6 +759,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = inode->i_ctime = current_time(inode); } + down_write(&F2FS_I(inode)->i_sem); + F2FS_I(inode)->last_disk_size = i_size_read(inode); + up_write(&F2FS_I(inode)->i_sem); + size_changed = true; } From 5562a3c53963a24d2f34c258fe11e09ce8aa336f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 7 Oct 2017 00:08:05 -0700 Subject: [PATCH 0452/1212] f2fs/crypto: drop crypto key at evict_inode only This patch avoids dropping crypto key in f2fs_drop_inode, so we can guarantee it happens only at evict_inode. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8e7ef3712bcc..c78bbb78c870 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -673,7 +673,6 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); - fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } From cb98f70dea02334bb6f30bb9e879456d789f3afe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 12 Oct 2017 19:12:53 -0700 Subject: [PATCH 0453/1212] f2fs: avoid stale fi->gdirty_list pointer When doing fault injection test, f2fs_evict_inode() didn't remove gdirty_list which incurs a kernel panic due to wrong pointer access. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ad4f7d52c0ad..3617e7fca930 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -520,8 +520,10 @@ void f2fs_evict_inode(struct inode *inode) stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + else + f2fs_inode_synced(inode); /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) From 8e84f379df61f86bb72830241a9a67de13a1b119 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Oct 2017 10:27:45 -0700 Subject: [PATCH 0454/1212] f2fs: expose some sectors to user in inline data or dentry case If there's some data written through inline data or dentry, we need to show st_blocks. This fixes reporting zero blocks even though there is small written data. Cc: stable@vger.kernel.org Reviewed-by: Chao Yu [Jaegeuk Kim: avoid link file for quotacheck] Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bf6845c01d38..b41a4a2d0a0a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -668,6 +668,12 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + return 0; } From 40d6250f046a3ddddc4410fc35af0fcc4ea6772d Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Sat, 14 Oct 2017 08:13:32 +0800 Subject: [PATCH 0455/1212] f2fs: skip searching non-exist range in truncate_hole Let's skip entire non-exist area to speed up truncate_hole by using get_next_page_offset. 
Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b41a4a2d0a0a..505fb4ee03a4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -842,7 +842,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start++; + pg_start = get_next_page_offset(&dn, pg_start); continue; } return err; From bb0db666d4bca3a2139bce9da3ed321a226a974a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Oct 2017 17:33:41 +0800 Subject: [PATCH 0456/1212] f2fs: trace f2fs_lookup This patch adds trace for f2fs_lookup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 49 +++++++++++++++++++++----------- include/trace/events/f2fs.h | 56 +++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d92b8e9064cb..5503da9c55f8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -331,12 +331,15 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct f2fs_dir_entry *de; struct page *page; - nid_t ino; + struct dentry *new; + nid_t ino = -1; int err = 0; unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + trace_f2fs_lookup_start(dir, dentry, flags); + if (f2fs_encrypted_inode(dir)) { - int res = fscrypt_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); /* * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is @@ -346,18 +349,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (fscrypt_has_encryption_key(dir)) fscrypt_set_encrypted_dentry(dentry); fscrypt_set_d_op(dentry); - if (res && res != -ENOKEY) - return ERR_PTR(res); + if (err && err != -ENOKEY) + goto out; } - if (dentry->d_name.len > F2FS_NAME_LEN) - return 
ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { - if (IS_ERR(page)) - return (struct dentry *)page; - return d_splice_alias(inode, dentry); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + goto out_splice; } ino = le32_to_cpu(de->ino); @@ -365,19 +372,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { err = __recover_dot_dentries(dir, root_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && @@ -386,12 +395,18 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); err = -EPERM; - goto err_out; + goto out_iput; } - return d_splice_alias(inode, dentry); - -err_out: +out_splice: + new = d_splice_alias(inode, dentry); + if (IS_ERR(new)) + err = PTR_ERR(new); + trace_f2fs_lookup_end(dir, dentry, ino, err); + return new; +out_iput: iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); return ERR_PTR(err); } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1c725ff5786b..427ad06189ec 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -716,6 +716,62 @@ TRACE_EVENT(f2fs_get_victim, __entry->free) ); +TRACE_EVENT(f2fs_lookup_start, + + TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(const 
char *, name) + __field(unsigned int, flags) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->name = dentry->d_name.name; + __entry->flags = flags; + ), + + TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", + show_dev_ino(__entry), + __entry->name, + __entry->flags) +); + +TRACE_EVENT(f2fs_lookup_end, + + TP_PROTO(struct inode *dir, struct dentry *dentry, nid_t ino, + int err), + + TP_ARGS(dir, dentry, ino, err), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(const char *, name) + __field(nid_t, cino) + __field(int, err) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->name = dentry->d_name.name; + __entry->cino = ino; + __entry->err = err; + ), + + TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", + show_dev_ino(__entry), + __entry->name, + __entry->cino, + __entry->err) +); + TRACE_EVENT(f2fs_fallocate, TP_PROTO(struct inode *inode, int mode, From 2b903fe94cd0743df361d23aa0ce486ee78be510 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:33 +0800 Subject: [PATCH 0457/1212] f2fs: trace f2fs_readdir This patch adds trace for f2fs_readdir. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 14 +++++++++----- include/trace/events/f2fs.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4f2a8fedb313..c745f977869c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -14,6 +14,7 @@ #include "node.h" #include "acl.h" #include "xattr.h" +#include static unsigned long dir_blocks(struct inode *inode) { @@ -847,6 +848,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct f2fs_dentry_block *dentry_blk = NULL; struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); @@ -855,16 +857,16 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (f2fs_encrypted_inode(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) - return err; + goto out; err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) - return err; + goto out; } if (f2fs_has_inline_dentry(inode)) { err = f2fs_read_inline_dir(file, ctx, &fstr); - goto out; + goto out_free; } /* readahead for multi pages of dir */ @@ -880,7 +882,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = 0; continue; } else { - goto out; + goto out_free; } } @@ -900,8 +902,10 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } -out: +out_free: fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); return err < 0 ? 
err : 0; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 427ad06189ec..c9be882c2718 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -772,6 +772,35 @@ TRACE_EVENT(f2fs_lookup_end, __entry->err) ); +TRACE_EVENT(f2fs_readdir, + + TP_PROTO(struct inode *dir, loff_t start_pos, loff_t end_pos, int err), + + TP_ARGS(dir, start_pos, end_pos, err), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, start) + __field(loff_t, end) + __field(int, err) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->start = start_pos; + __entry->end = end_pos; + __entry->err = err; + ), + + TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d", + show_dev_ino(__entry), + __entry->start, + __entry->end, + __entry->err) +); + TRACE_EVENT(f2fs_fallocate, TP_PROTO(struct inode *inode, int mode, From c8be47b54018a12c96dd7328951405202f6c3d89 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:34 +0800 Subject: [PATCH 0458/1212] f2fs: allow readdir() to be interrupted This patch follows ext4 to allow readdir() in large empty directory to be interrupted. Referenced commit of ext4: 1f60fbe72749 ("ext4: allow readdir()'s of large empty directories to be interrupted"). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c745f977869c..95500eaae681 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -10,6 +10,7 @@ */ #include #include +#include #include "f2fs.h" #include "node.h" #include "acl.h" @@ -875,6 +876,14 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); for (; n < npages; n++) { + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); From 4d6e68be2534b03a135d5e98dee94d072166c455 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:35 +0800 Subject: [PATCH 0459/1212] f2fs: relocate readahead codes in readdir() Previously, for a large directory, we did readahead only once in readdir(), so readdir()'s performance may drop when traversing later blocks. In order to avoid this, relocate the readahead code so that it covers the whole traversal flow. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 95500eaae681..65c528539b78 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -870,11 +870,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - /* readahead for multi pages of dir */ - if (npages - n > 1 && !ra_has_index(ra, n)) - page_cache_sync_readahead(inode->i_mapping, ra, file, n, - min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - for (; n < npages; n++) { /* allow readdir() to be interrupted */ @@ -884,6 +879,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } cond_resched(); + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); From 83ed7a615f0a8c2efb448a17c0e51ab33adbee2c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:36 +0800 Subject: [PATCH 0460/1212] f2fs: update ctx->pos correctly when hitting hole in directory This patch fixes to update ctx->pos correctly when hitting hole in directory. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 65c528539b78..1955707b138b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -870,7 +870,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++) { + for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -907,7 +907,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) break; } - ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } From 171b638fc49bdaf3302d7df8eb7b9d5bc2d3dfbe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 18 Oct 2017 19:05:57 -0700 Subject: [PATCH 0461/1212] f2fs: limit # of inmemory pages If some abnormal users try lots of atomic write operations, f2fs is able to produce pinned pages in the main memory which affects system performance. This patch limits that as 20% over total memory size, and if f2fs reaches to the limit, it will drop all the inmemory pages. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 4 ++++ fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/f2fs/super.c | 1 + 6 files changed, 55 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 43e32216e681..6750584b7107 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1941,6 +1941,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); + if (f2fs_is_atomic_file(inode) && + !available_free_memory(sbi, INMEM_PAGES)) { + err = -ENOMEM; + goto fail; + } + /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: @@ -2017,6 +2023,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages_all(sbi); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bd5839895737..571b7f18171c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -665,6 +665,7 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + struct list_head inmem_ilist; /* list for inmem inodes */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -1029,6 +1030,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -2625,6 +2627,7 @@ void destroy_node_manager_caches(void); */ bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages_all(struct f2fs_sb_info *sbi); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 733a8e14a4c8..22f2ba4a6326 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -74,6 +74,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INMEM_PAGES) { + /* it allows 20% / total_ram for inmemory pages */ + mem_size = get_pages(sbi, F2FS_INMEM_PAGES); + res = mem_size < (val.totalram / 5); } else { if 
(!sbi->sb->s_bdi->wb.dirty_exceeded) return true; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e91b08b4a51a..0ee3e5ff49a3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -140,6 +140,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ + INMEM_PAGES, /* indicates inmemory pages */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a065a2c01b5f..f0916b24f5b4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,6 +186,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) void register_inmem_page(struct inode *inode, struct page *page) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; @@ -204,6 +205,10 @@ void register_inmem_page(struct inode *inode, struct page *page) mutex_lock(&fi->inmem_lock); get_page(page); list_add_tail(&new->list, &fi->inmem_pages); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(&fi->inmem_ilist)) + list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); @@ -262,12 +267,41 @@ static int __revoke_inmem_pages(struct inode *inode, return err; } +void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; + struct inode *inode; + struct f2fs_inode_info *fi; +next: + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + return; + } + fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + if (inode) { + drop_inmem_pages(inode); + iput(inode); + } + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto next; +} + void drop_inmem_pages(struct inode *inode) { + struct 
f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); @@ -399,6 +433,10 @@ int commit_inmem_pages(struct inode *inode) /* drop all uncommitted pages */ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_COMMIT); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c78bbb78c870..1f8711bbb89f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -625,6 +625,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); From 032a6906825a2ebe60204ba67002a17116113c13 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2017 09:43:56 -0700 Subject: [PATCH 0462/1212] f2fs: retry ENOMEM for quota_read|write This gives another chance to read or write quota data. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f8711bbb89f..92abf034bde7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1322,8 +1322,13 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: page = read_mapping_page(mapping, blkidx, NULL); - if (IS_ERR(page)) + if (IS_ERR(page)) { + if (PTR_ERR(page) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto repeat; + } return PTR_ERR(page); + } lock_page(page); @@ -1366,11 +1371,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, while (towrite > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); - +retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, &page, NULL); - if (unlikely(err)) + if (unlikely(err)) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } break; + } kaddr = kmap_atomic(page); memcpy(kaddr + offset, data, tocopy); From 5f5f59322240a84bcf4c1896754b255b4b158d1d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2017 11:48:57 -0700 Subject: [PATCH 0463/1212] f2fs: remove obsolete pointer for truncate_xattr_node This patch removes the obsolete parameter for truncate_xattr_node. 
Suggested-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 10 ++++------ fs/f2fs/xattr.c | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 571b7f18171c..72de8ae4be13 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2592,7 +2592,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode, struct page *page); +int truncate_xattr_node(struct inode *inode); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 22f2ba4a6326..d7e0a4366527 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -962,7 +962,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) return err > 0 ? 
0 : err; } -int truncate_xattr_node(struct inode *inode, struct page *page) +/* caller must lock inode page */ +int truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -978,10 +979,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, page, npage, nid); - - if (page) - dn.inode_page_locked = true; + set_new_dnode(&dn, inode, NULL, npage, nid); truncate_node(&dn); return 0; } @@ -1000,7 +998,7 @@ int remove_inode_page(struct inode *inode) if (err) return err; - err = truncate_xattr_node(inode, dn.inode_page); + err = truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c5e6a7e42262..442c7ec5acd0 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -468,7 +468,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode, ipage); + err = truncate_xattr_node(inode); alloc_nid_failed(sbi, new_nid); return err; } From 5c15033ceaea9900ecd1a5551a8080ee1a4abfdb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2017 12:07:11 -0700 Subject: [PATCH 0464/1212] Revert "f2fs: return wrong error number on f2fs_quota_write" This reverts commit 4f31d26b0c17f2aae6a6afeb823a87e20671ab4b. It turns out that we need to report error number if nothing was written. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 92abf034bde7..8d79b1887cb4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1397,7 +1397,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return 0; + return err; inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); From 03b1cb0bb4a2f4b1e512aa2b3dcaf22717e7e76e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 Oct 2017 10:34:14 +0800 Subject: [PATCH 0465/1212] f2fs: fix to correct no_fggc_candidate There may be extreme case as below: For one section contains one segment, and there are total 100 segments with 10% over-provision ratio in f2fs partition, fggc_threshold will be rounded down to 460 instead of 460.8 as below calculation: sbi->fggc_threshold = div_u64((u64)(main_count - ovp_count) * BLKS_PER_SEC(sbi), (main_count - resv_count)); If section usage is as: 60 segments which contain 460 valid blocks 40 segments which contain 462 valid blocks As valid block number in all sections is larger than fggc_threshold, so none of them will be chosen as candidate due to incorrect fggc_threshold. Let's just soften the term of choosing foreground GC candidates. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d12d9cd99f91..9342b973da65 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -730,7 +730,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > sbi->fggc_threshold) return true; return false; From 807486c79534a3e9286f40a3cbf908a827d5a957 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 11:52:47 +0200 Subject: [PATCH 0466/1212] f2fs: avoid using timespec All uses of timespec are deprecated, and this one is not particularly useful, as the documented method for converting seconds to jiffies is to multiply by 'HZ'. Signed-off-by: Arnd Bergmann Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 72de8ae4be13..88d5c99f44e7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1269,8 +1269,7 @@ static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) { - struct timespec ts = {sbi->interval_time[type], 0}; - unsigned long interval = timespec_to_jiffies(&ts); + unsigned long interval = sbi->interval_time[type] * HZ; return time_after(jiffies, sbi->last_time[type] + interval); } From 91bea0c391b3c01b617237a107e76151c2b376b7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 19 Oct 2017 12:58:21 +0200 Subject: [PATCH 0467/1212] f2fs: remove several redundant assignments There are several assignments to variables that are redundant as the values are never read when the variables are updated later and so the redundant 
statements can be safely removed. Cleans up clang warnings: fs/f2fs/segment.c:923:19: warning: Value stored to 'p' during its initialization is never read fs/f2fs/segment.c:2060:2: warning: Value stored to 'hint' is never read fs/f2fs/segment.c:2353:2: warning: Value stored to 'start_block' is never read fs/f2fs/segment.c:2354:2: warning: Value stored to 'end_block' is never read Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f0916b24f5b4..85295baa74c8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1038,7 +1038,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct rb_node **p = &dcc->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct discard_cmd *dc = NULL; @@ -2175,7 +2175,6 @@ static void get_new_segment(struct f2fs_sb_info *sbi, } secno = left_start; skip_left: - hint = secno; segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); @@ -2468,9 +2467,6 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); - start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, end_segno + 1); - cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); From fc13f9d7ce1e4d04dcc4204dad751cb9dda11d3b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 23 Oct 2017 23:48:49 +0200 Subject: [PATCH 0468/1212] f2fs: stop all the operations by cp_error flag This patch replaces to use cp_error flag instead of RDONLY for quota off. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 3 +++ fs/f2fs/checkpoint.c | 1 - fs/f2fs/file.c | 26 ++++++++++++++++++++++++++ fs/f2fs/namei.c | 30 ++++++++++++++++++++++++++++++ fs/f2fs/super.c | 3 +++ 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 112f8e04c549..3f52efa0f94f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -253,6 +253,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + return __f2fs_set_acl(inode, type, acl, NULL); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 90ff066c9569..48f9366240a2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { set_ckpt_flags(sbi, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; if (!end_io) f2fs_flush_merged_writes(sbi); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 505fb4ee03a4..19cdf9f5261b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -56,6 +56,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto err; + } + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -117,6 +122,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); +err: return block_page_mkwrite_return(err); } @@ -313,6 +319,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; return f2fs_do_sync_file(file, start, end, datasync, false); } @@ -449,6 +457,9 @@ static int f2fs_file_mmap(struct 
file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -635,6 +646,9 @@ int f2fs_truncate(struct inode *inode) { int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return 0; @@ -713,6 +727,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) int err; bool size_changed = false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + err = inode_change_ok(inode, attr); if (err) return err; @@ -1444,6 +1461,9 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -2420,6 +2440,9 @@ static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + switch (cmd) { case F2FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); @@ -2473,6 +2496,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5503da9c55f8..a2402ccc0779 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -177,6 +177,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -221,6 +224,9 @@ static int 
f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir) && !fscrypt_has_permitted_context(dir, inode)) return -EPERM; @@ -420,6 +426,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -472,6 +481,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct fscrypt_symlink_data *sd = NULL; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { err = fscrypt_get_encryption_info(dir); if (err) @@ -578,6 +590,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -630,6 +645,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -724,6 +742,9 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { int err = fscrypt_get_encryption_info(dir); if (err) @@ -735,6 +756,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); } @@ -754,6 +778,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; 
+ if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && @@ -947,6 +974,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8d79b1887cb4..94bbcaeb2b6e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -860,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return 0; + trace_f2fs_sync_fs(sb, sync); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) From 46d4a691f035642af7a51786182963ec8a1748fb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Oct 2017 09:46:54 +0200 Subject: [PATCH 0469/1212] f2fs: show # of dirty segments via sysfs This patch adds one sysfs entry to show # of dirty segments which can be used for gc timing by user. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 89f25efffd43..48ebe6153cc5 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -63,6 +63,13 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return NULL; } +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -278,6 +285,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif +F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); @@ -320,6 +328,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif + ATTR_LIST(dirty_segments), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), From 5b8ff1301a61a8b93e2e6c3c72c4378a993e8989 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 23 Oct 2017 23:50:15 +0200 Subject: [PATCH 0470/1212] f2fs: add missing quota_initialize This patch adds calls to quota_initialize in f2fs_set_acl, f2fs_unlink, and f2fs_rename. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 9 +++++++++ fs/f2fs/xattr.c | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a2402ccc0779..f78de030b8b7 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -430,6 +430,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) return -EIO; err = dquot_initialize(dir); + if (err) + return err; + err = dquot_initialize(inode); if (err) return err; @@ -806,6 +809,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; + if (new_inode) { + err = dquot_initialize(new_inode); + if (err) + goto out; + } + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 442c7ec5acd0..1f35ae6a4170 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -718,6 +718,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; + err = dquot_initialize(inode); + if (err) + return err; + /* this case is only from init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, From ada4061e191bd0eba88f811d386e72fcc39cce97 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 26 Oct 2017 10:31:22 +0200 Subject: [PATCH 0471/1212] f2fs: show current cp state This patch shows whether checkpoint met any error case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d441660c3ba6..f7eec506ceea 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -264,9 +264,10 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). 
#%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW"); + f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_cp_error(si->sbi) ? "Error": "Good"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", From 7368e30495c52bc0b42819e75f78269ddec6c530 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Sep 2017 21:59:50 +0800 Subject: [PATCH 0472/1212] f2fs: support flexible inline xattr size Now, in product, more and more features based on file encryption were introduced, their demand of xattr space is increasing, however, inline xattr has fixed-size of 200 bytes, once inline xattr space is full, new increased xattr data would occupy additional xattr block which may bring us more space usage and performance regression during persisting. In order to resolve above issue, it's better to expand inline xattr size flexibly according to user's requirement. So this patch introduces new filesystem feature 'flexible inline xattr', and new mount option 'inline_xattr_size=%u', once mkfs enables the feature, we can use the option to make f2fs supporting flexible inline xattr size. To support this feature, we add extra attribute i_inline_xattr_size in inode layout, indicating that how many space inline xattr borrows from block address mapping space in inode layout, by this, we can easily locate and store flexible-sized inline xattr data in inode. Inode disk layout: +----------------------+ | .i_mode | | ... | | .i_ext | +----------------------+ | .i_extra_isize | | .i_inline_xattr_size |-----------+ | ... 
| | +----------------------+ | | .i_addr | | | - block address or | | | - inline data | | +----------------------+<---+ v | inline xattr | +---inline xattr range +----------------------+<---+ | .i_nid | +----------------------+ | node_footer | | (nid, ino, offset) | +----------------------+ Note that, we have to consider backward compatibility which reserved inline_data space, 200 bytes, all the time, reported by Sheng Yong. Previous inline data or directory always reserved 200 bytes in inode layout, even if inline_xattr is disabled. In order to keep inline_dentry's structure for backward compatibility, we get the space back only from inline_data. Signed-off-by: Chao Yu Reported-by: Sheng Yong Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 39 ++++++++++++++++++++++++++------------- fs/f2fs/inode.c | 21 +++++++++++++++++++++ fs/f2fs/namei.c | 13 +++++++++++++ fs/f2fs/node.c | 10 ++++++++-- fs/f2fs/super.c | 32 +++++++++++++++++++++++++++++++- fs/f2fs/sysfs.c | 7 +++++++ fs/f2fs/xattr.c | 18 +++++++++--------- include/linux/f2fs_fs.h | 5 +++-- 8 files changed, 118 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 88d5c99f44e7..6ae52815b33f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -94,6 +94,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -119,6 +120,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -461,11 +463,14 @@ struct f2fs_flush_device { /* for inline stuff */ #define 
DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ - (CUR_ADDRS_PER_INODE(inode) - \ - DEF_INLINE_RESERVED_SIZE - \ - F2FS_INLINE_XATTR_ADDRS)) +static inline int get_inline_xattr_addrs(struct inode *inode); +#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + F2FS_INLINE_XATTR_ADDRS(inode) - \ + DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ @@ -676,6 +681,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1133,6 +1139,7 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ + int inline_xattr_size; /* inline xattr size */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -2247,25 +2254,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; - return CUR_ADDRS_PER_INODE(inode); + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); } -static inline void *inline_xattr_addr(struct page *page) +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS]); + F2FS_INLINE_XATTR_ADDRS(inode)]); } static inline int inline_xattr_size(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - 
return F2FS_INLINE_XATTR_ADDRS << 2; - else - return 0; + return get_inline_xattr_addrs(inode) * sizeof(__le32); } static inline int f2fs_has_inline_data(struct inode *inode) @@ -2435,6 +2437,12 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -3104,6 +3112,11 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3617e7fca930..9684d53563f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -232,6 +232,23 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. 
+ */ + fi->i_inline_xattr_size = 0; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -384,6 +401,10 @@ int update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f78de030b8b7..cf8f4370d256 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + int xattr_size = 0; int err; inode = new_inode(dir->i_sb); @@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = sbi->inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + f2fs_init_extent_tree(inode, NULL); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d7e0a4366527..77b39a058a34 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2194,8 +2194,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page) goto update_inode; } - dst_addr = 
inline_xattr_addr(ipage); - src_addr = inline_xattr_addr(page); + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -2284,6 +2284,12 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + if (f2fs_sb_has_project_quota(sbi->sb) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 94bbcaeb2b6e..ff3cbfa0c1c9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -92,6 +92,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_noinline_xattr, + Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -141,6 +142,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -383,6 +385,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_xattr: clear_opt(sbi, INLINE_XATTR); break; + case Opt_inline_xattr_size: + if (args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + sbi->inline_xattr_size = arg; + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -604,6 +612,24 @@ static int parse_options(struct super_block *sb, char *options) F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!test_opt(sbi, 
INLINE_XATTR)) { + f2fs_msg(sb, KERN_ERR, + "inline_xattr_size option should be " + "set with inline_xattr option"); + return -EINVAL; + } + if (!sbi->inline_xattr_size || + sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { + f2fs_msg(sb, KERN_ERR, + "inline xattr size is out of range"); + return -EINVAL; + } + } return 0; } @@ -1051,6 +1077,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_xattr"); else seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + sbi->inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1113,6 +1142,7 @@ static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; + sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1674,7 +1704,7 @@ static loff_t max_file_blocks(void) /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - - * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more * space in inode.i_addr, it will be more safe to reassign * result as zero. */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 48ebe6153cc5..e09e59cc678a 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -107,6 +107,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_chksum(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "flexible_inline_xattr"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -216,6 +219,7 @@ enum feat_id { FEAT_EXTRA_ATTR, FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, + FEAT_FLEXIBLE_INLINE_XATTR, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -228,6 +232,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_EXTRA_ATTR: case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: + case FEAT_FLEXIBLE_INLINE_XATTR: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -299,6 +304,7 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -346,6 +352,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(extra_attr), ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), NULL, }; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1f35ae6a4170..bcf455abe845 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,12 +264,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, - void **last_addr, int index, - size_t len, const char *name) +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; - unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + unsigned int inline_size = inline_xattr_size(inode); list_for_each_xattr(entry, base_addr) { if ((void *)entry + sizeof(__u32) > base_addr + inline_size || @@ -297,13 +297,13 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, void 
*inline_addr; if (ipage) { - inline_addr = inline_xattr_addr(ipage); + inline_addr = inline_xattr_addr(inode, ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); - inline_addr = inline_xattr_addr(page); + inline_addr = inline_xattr_addr(inode, page); } memcpy(txattr_addr, inline_addr, inline_size); f2fs_put_page(page, 1); @@ -356,7 +356,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (err) goto out; - *xe = __find_inline_xattr(txattr_addr, &last_addr, + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); if (*xe) goto check; @@ -451,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *inline_addr; if (ipage) { - inline_addr = inline_xattr_addr(ipage); + inline_addr = inline_xattr_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); set_page_dirty(ipage); } else { @@ -460,7 +460,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, alloc_nid_failed(sbi, new_nid); return PTR_ERR(page); } - inline_addr = inline_xattr_addr(page); + inline_addr = inline_xattr_addr(inode, page); f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c2a975e4a711..d79ca96045e4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -184,7 +184,8 @@ struct f2fs_extent { } __packed; #define F2FS_NAME_LEN 255 -#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ +/* 200 bytes for inline xattrs by default */ +#define DEFAULT_INLINE_XATTR_ADDRS 50 #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ get_extra_isize(inode)) @@ -238,7 +239,7 @@ struct f2fs_inode { union { struct { __le16 i_extra_isize; /* extra inode attribute size */ - __le16 i_padding; /* padding */ + __le16 i_inline_xattr_size; /* inline xattr size, 
unit: 4 bytes */ __le32 i_projid; /* project id */ __le32 i_inode_checksum;/* inode meta checksum */ __le32 i_extra_end[0]; /* for attribute size calculation */ From b7b2e629b6f6a360dd2314dcbf594dfe111efc49 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 16 Oct 2017 15:05:16 -0700 Subject: [PATCH 0473/1212] f2fs: handle error case when adding xattr entry This patch fixes recovering incomplete xattr entries remaining in inline xattr and xattr block, caused by any kind of errors. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 51 +++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index bcf455abe845..7acf56ebda65 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -436,10 +436,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = inline_xattr_size(inode); + struct page *in_page = NULL; void *xattr_addr; + void *inline_addr = NULL; struct page *xpage; nid_t new_nid = 0; - int err; + int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) @@ -447,30 +449,30 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); - set_page_dirty(ipage); } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { + in_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(in_page)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(page); + return PTR_ERR(in_page); } - inline_addr = inline_xattr_addr(inode, page); - f2fs_wait_on_page_writeback(page, NODE, true); + inline_addr = inline_xattr_addr(inode, in_page); } - memcpy(inline_addr, txattr_addr, inline_size); - f2fs_put_page(page, 1); + 
f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { err = truncate_xattr_node(inode); alloc_nid_failed(sbi, new_nid); - return err; + if (err) { + f2fs_put_page(in_page, 1); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + set_page_dirty(ipage ? ipage : in_page); + goto in_page_out; } } @@ -479,7 +481,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } f2fs_bug_on(sbi, new_nid); f2fs_wait_on_page_writeback(xpage, NODE, true); @@ -489,17 +491,24 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } alloc_nid_done(sbi, new_nid); } - xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); - set_page_dirty(xpage); - f2fs_put_page(xpage, 1); - return 0; + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + set_page_dirty(ipage ? ipage : in_page); + set_page_dirty(xpage); + + f2fs_put_page(xpage, 1); +in_page_out: + f2fs_put_page(in_page, 1); + return err; } int f2fs_getxattr(struct inode *inode, int index, const char *name, From e945474a9c1b018159ff5ba398bfafb1dc6a5956 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Fri, 27 Oct 2017 20:45:05 +0800 Subject: [PATCH 0474/1212] f2fs: support soft block reservation It supports to extend reserved_blocks sysfs interface to be soft threshold, which allows user configure it exceeding current available user space. 
This patch also introduces a new sysfs interface called current_reserved_blocks, which shows the current blocks that have already been reserved. Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 13 ++++++++++++- fs/f2fs/f2fs.h | 13 +++++++++++-- fs/f2fs/super.c | 3 ++- fs/f2fs/sysfs.c | 15 ++++++++++++--- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 500c60403653..2174c66ce1fe 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -132,7 +132,18 @@ What: /sys/fs/f2fs//reserved_blocks Date: June 2017 Contact: "Chao Yu" Description: - Controls current reserved blocks in system. + Controls target reserved blocks in system, the threshold + is soft, it could exceed current available user space. + +What: /sys/fs/f2fs//current_reserved_blocks +Date: October 2017 +Contact: "Yunlong Song" +Contact: "Chao Yu" +Description: + Shows current reserved blocks in system, it may be temporarily + smaller than target_reserved_blocks, but will gradually + increase to target_reserved_blocks when more free blocks are + freed by user later. 
What: /sys/fs/f2fs//gc_urgent Date: August 2017 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6ae52815b33f..d67d1d972459 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1146,6 +1146,7 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ u32 s_next_generation; /* for NFS support */ @@ -1618,7 +1619,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; @@ -1652,6 +1654,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); f2fs_i_blocks_write(inode, count, false, true); } @@ -1798,7 +1804,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count + sbi->reserved_blocks > + if (unlikely(valid_block_count + sbi->current_reserved_blocks > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; @@ -1841,6 +1847,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, sbi->total_valid_node_count--; sbi->total_valid_block_count--; + if 
(sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ff3cbfa0c1c9..8a99182b368f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -988,7 +988,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi) - - sbi->reserved_blocks; + sbi->current_reserved_blocks; avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -2466,6 +2466,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e09e59cc678a..4166ac74e837 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -30,7 +30,7 @@ enum { FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif - RESERVED_BLOCKS, + RESERVED_BLOCKS, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -114,6 +114,12 @@ static ssize_t features_show(struct f2fs_attr *a, return len; } +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -153,12 +159,13 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if ((unsigned long)sbi->total_valid_block_count + t > - (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)sbi->user_block_count) { 
spin_unlock(&sbi->stat_lock); return -EINVAL; } *ui = t; + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); spin_unlock(&sbi->stat_lock); return count; } @@ -293,6 +300,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); #ifdef CONFIG_F2FS_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -338,6 +346,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), NULL, }; From 09a073cc8c565c797b73e525cb0c700cc4daaf77 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:29 +0800 Subject: [PATCH 0475/1212] f2fs: add missing sysfs description There are some missing sysfs entries' description in document, add them. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2174c66ce1fe..a07134c517e0 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -51,6 +51,12 @@ Description: Controls the dirty page count condition for the in-place-update policies. +What: /sys/fs/f2fs//min_hot_blocks +Date: March 2017 +Contact: "Jaegeuk Kim" +Description: + Controls the dirty page count condition for redefining hot data. + What: /sys/fs/f2fs//max_small_discards Date: November 2013 Contact: "Jaegeuk Kim" @@ -96,6 +102,18 @@ Contact: "Jaegeuk Kim" Description: Controls the checkpoint timing. +What: /sys/fs/f2fs//idle_interval +Date: January 2016 +Contact: "Jaegeuk Kim" +Description: + Controls the idle timing. 
+ +What: /sys/fs/f2fs//iostat_enable +Date: August 2017 +Contact: "Chao Yu" +Description: + Controls to enable/disable IO stat. + What: /sys/fs/f2fs//ra_nid_pages Date: October 2015 Contact: "Chao Yu" @@ -116,6 +134,12 @@ Contact: "Shuoran Liu" Description: Shows total written kbytes issued to disk. +What: /sys/fs/f2fs//features +Date: July 2017 +Contact: "Jaegeuk Kim" +Description: + Shows all enabled features in current device. + What: /sys/fs/f2fs//inject_rate Date: May 2016 Contact: "Sheng Yong" From 583b7a274c2719d939b0703e53d480edc35c8ac1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:30 +0800 Subject: [PATCH 0476/1212] f2fs: support get_page error injection This patch adds to support get_page error injection to simulate out-of-memory test scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 14 ++++++++++++++ fs/f2fs/gc.c | 4 ++-- fs/f2fs/node.c | 3 ++- fs/f2fs/super.c | 1 + 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6750584b7107..81ec0c6aeedf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1962,7 +1962,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, * Do not use grab_cache_page_write_begin() to avoid deadlock due to * wait_for_stable_page. Will wait that below with our IO control. 
*/ - page = grab_cache_page(mapping, index); + page = f2fs_pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; goto fail; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d67d1d972459..b35d894762b5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -47,6 +47,7 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -1896,6 +1897,19 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); } +static inline struct page *f2fs_pagecache_get_page( + struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { + f2fs_show_injection_info(FAULT_PAGE_GET); + return NULL; + } +#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); +} + static inline void f2fs_copy_page(struct page *src, struct page *dst) { char *src_kaddr = kmap(src); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 32b0b0632e15..359e7b5590f6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -650,8 +650,8 @@ static void move_data_block(struct inode *inode, block_t bidx, allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, - FGP_LOCK | FGP_CREAT, GFP_NOFS); + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; goto recover_block; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 77b39a058a34..1c81a915c343 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1218,7 +1218,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + page = 
f2fs_pagecache_get_page(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8a99182b368f..a5304d9d1392 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -44,6 +44,7 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", From 5612922fb0acb33f54fc0a67837b3510dde4b00b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:31 +0800 Subject: [PATCH 0477/1212] f2fs: support bio allocation error injection This patch adds to support bio allocation error injection to simulate out-of-memory test scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 23 +++++++++++++++++------ fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 1 + 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 81ec0c6aeedf..043394aa6c62 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -171,7 +171,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - bio = f2fs_bio_alloc(npages); + bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? 
f2fs_read_end_io : f2fs_write_end_io; @@ -471,7 +471,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, f2fs_wait_on_block_writeback(sbi, blkaddr); } - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) { if (ctx) fscrypt_release_ctx(ctx); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b35d894762b5..28df48ca5f1e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -48,6 +48,7 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, + FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -1959,15 +1960,25 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(int npages) +static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, + int npages, bool no_fail) { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; + if (no_fail) { + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(FAULT_ALLOC_BIO); + return NULL; + } +#endif + return bio_alloc(GFP_KERNEL, npages); } static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 85295baa74c8..132e1e424ffe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -511,7 +511,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(0); + struct bio *bio = f2fs_bio_alloc(sbi, 0, true); int ret; bio->bi_rw = REQ_OP_WRITE; @@ -943,7 +943,7 @@ static int __blkdev_issue_discard(struct block_device 
*bdev, sector_t sector, if (ret) return ret; } - bio = f2fs_bio_alloc(1); + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, 1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5304d9d1392..4c87c75ed352 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -45,6 +45,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", From 90c28a18d2a499c53dbff24b382d1b8e4e9547d3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:32 +0800 Subject: [PATCH 0478/1212] f2fs: give correct trimmed blocks in fstrim We have supported to issue discard in specified range during fstrim, it needs to return caller with successfully trimmed bytes in that range instead of bytes of invalid blocks which are scanned in checkpoint. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 27 +++++++++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 28df48ca5f1e..8e6ad6543bcf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -229,7 +229,6 @@ struct cp_control { __u64 trim_start; __u64 trim_end; __u64 trim_minlen; - __u64 trimmed; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 132e1e424ffe..27a6df3bbff3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1336,21 +1336,27 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } -static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; - if (!dc->ref) + if (!dc->ref) { + if (!dc->error) + len = dc->len; __remove_discard_cmd(sbi, dc); + } mutex_unlock(&dcc->cmd_lock); + + return len; } -static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, block_t start, block_t end) { @@ -1359,6 +1365,7 @@ static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; + unsigned int trimmed = 0; next: need_wait = false; @@ -1371,6 +1378,8 @@ static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, continue; if (dc->state == D_DONE && !dc->ref) { wait_for_completion_io(&dc->wait); + if (!dc->error) + trimmed += dc->len; __remove_discard_cmd(sbi, dc); } else { dc->ref++; @@ -1381,9 +1390,11 @@ static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, mutex_unlock(&dcc->cmd_lock); if (need_wait) { - 
__wait_one_discard_bio(sbi, dc); + trimmed += __wait_one_discard_bio(sbi, dc); goto next; } + + return trimmed; } static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, @@ -1744,7 +1755,6 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); - cpc->trimmed += len; total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, @@ -2447,12 +2457,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; + unsigned long long trimmed = 0; int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - cpc.trimmed = 0; if (end <= MAIN_BLKADDR(sbi)) goto out; @@ -2499,9 +2509,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); out: - range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } From ae66786296b4a210c75db6259300636ceb1abdba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:33 +0800 Subject: [PATCH 0479/1212] f2fs: export SSR allocation threshold This patch exports min_ssr_segments threshold in sysfs to let user control triggering SSR allocation flexibly. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 2 ++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index a07134c517e0..2baed1151eac 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,12 @@ Contact: "Jaegeuk Kim" Description: Controls the dirty page count condition for redefining hot data. +What: /sys/fs/f2fs//min_ssr_sections +Date: October 2017 +Contact: "Chao Yu" +Description: + Controls the free section threshold to trigger SSR allocation. + What: /sys/fs/f2fs//max_small_discards Date: November 2013 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8e6ad6543bcf..6654a96f0907 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -904,6 +904,7 @@ struct f2fs_sm_info { unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -1141,6 +1142,7 @@ struct f2fs_sb_info { int active_logs; /* # of active logs */ int dir_level; /* directory level */ int inline_xattr_size; /* inline xattr size */ + unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 27a6df3bbff3..af536d427424 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -181,7 +181,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); + SM_I(sbi)->min_ssr_sections + 
reserved_sections(sbi)); } void register_inmem_page(struct inode *inode, struct page *page) @@ -3751,6 +3751,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 4166ac74e837..f0fdc89ce82f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -285,6 +285,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -330,6 +331,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), From 873ec505cb075ab6ad41721289fd8530ba777cab Mon Sep 17 00:00:00 2001 From: Fan Li Date: Sat, 28 Oct 2017 19:03:37 +0800 Subject: [PATCH 0480/1212] f2fs: add a function to move nid This patch add a new function to move nid from one state to another. Move operation is heavily used, by adding a new function for it we can cut down some branches from several flow. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 57 +++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1c81a915c343..a1f8307b1085 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1765,15 +1765,13 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, } static int __insert_free_nid(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_state state, bool new) + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (new) { - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); - if (err) - return err; - } + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; f2fs_bug_on(sbi, state != i->state); nm_i->nid_cnt[state]++; @@ -1783,7 +1781,7 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi, } static void __remove_free_nid(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_state state, bool reuse) + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1791,8 +1789,29 @@ static void __remove_free_nid(struct f2fs_sb_info *sbi, nm_i->nid_cnt[state]--; if (state == FREE_NID) list_del(&i->list); - if (!reuse) - radix_tree_delete(&nm_i->free_nid_root, i->nid); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); + } } /* return if the nid is recognized as free */ @@ -1852,7 +1871,7 @@ static bool add_free_nid(struct 
f2fs_sb_info *sbi, nid_t nid, bool build) } } ret = true; - err = __insert_free_nid(sbi, i, FREE_NID, true); + err = __insert_free_nid(sbi, i, FREE_NID); err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); @@ -1871,7 +1890,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == FREE_NID) { - __remove_free_nid(sbi, i, FREE_NID, false); + __remove_free_nid(sbi, i, FREE_NID); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -2082,9 +2101,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid, list); *nid = i->nid; - __remove_free_nid(sbi, i, FREE_NID, true); - i->state = PREALLOC_NID; - __insert_free_nid(sbi, i, PREALLOC_NID, false); + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); nm_i->available_nids--; update_free_nid_bitmap(sbi, *nid, false, false); @@ -2110,7 +2127,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_free_nid(sbi, i, PREALLOC_NID, false); + __remove_free_nid(sbi, i, PREALLOC_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -2133,12 +2150,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, !i); if (!available_free_memory(sbi, FREE_NIDS)) { - __remove_free_nid(sbi, i, PREALLOC_NID, false); + __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { - __remove_free_nid(sbi, i, PREALLOC_NID, true); - i->state = FREE_NID; - __insert_free_nid(sbi, i, FREE_NID, false); + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); } nm_i->available_nids++; @@ -2169,7 +2184,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) break; - __remove_free_nid(sbi, i, FREE_NID, false); + __remove_free_nid(sbi, i, FREE_NID); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2748,7 
+2763,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - __remove_free_nid(sbi, i, FREE_NID, false); + __remove_free_nid(sbi, i, FREE_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From c713fdb5a23cdda4ed85e04c7dec1094ab9c691f Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 30 Oct 2017 09:33:41 +0800 Subject: [PATCH 0481/1212] Revert "f2fs: handle dirty segments inside refresh_sit_entry" This reverts commit 5e443818fa0b2a2845561ee25bec181424fb2889 The commit should be reverted because call sequence of below two parts of code must be kept: a. update sit information, it needs to be updated before segment allocation since latter allocation may trigger SSR, and SSR allocation needs latest valid block information of all segments. b. update segment status, it needs to be updated after segment allocation since we can skip updating current opened segment status. 
Fixes: 5e443818fa0b ("f2fs: handle dirty segments inside refresh_sit_entry") Suggested-by: Chao Yu Signed-off-by: Yunlong Song Reviewed-by: Chao Yu [Jaegeuk Kim: remove refresh_sit_entry function] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 27 ++++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6654a96f0907..d8fcdead2aa5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2683,7 +2683,6 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void init_discard_policy(struct discard_policy *dpolicy, int discard_type, unsigned int granularity); -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index af536d427424..f59b00aa502b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1974,16 +1974,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) -{ - update_sit_entry(sbi, new, 1); - if (GET_SEGNO(sbi, old) != NULL_SEGNO) - update_sit_entry(sbi, old, -1); - - locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); - locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); -} - void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); @@ -2620,13 +2610,24 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. 
+ */ + update_sit_entry(sbi, *new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); + /* - * SIT information should be updated after segment allocation, - * since we need to keep dirty segments precisely under SSR. + * segment dirty status should be updated after segment allocation, + * so we just need to update status only one time after previous + * segment being closed. */ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); mutex_unlock(&sit_i->sentry_lock); From f46ae958c701e580c4405c7b455438980d78585a Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 30 Oct 2017 14:18:55 +0800 Subject: [PATCH 0482/1212] f2fs: modify for accurate fggc node io stat modify for accurate fggc node io stat Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 62 +++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a1f8307b1085..99c966fcf32d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1243,37 +1243,6 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) iput(inode); } -void move_node_page(struct page *node_page, int gc_type) -{ - if (gc_type == FG_GC) { - struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - .for_reclaim = 0, - }; - - set_page_dirty(node_page); - f2fs_wait_on_page_writeback(node_page, NODE, true); - - f2fs_bug_on(sbi, PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) - goto out_page; - - if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) - unlock_page(node_page); - goto release_page; - } else { - /* set page dirty and write it */ - if 
(!PageWriteback(node_page)) set_page_dirty(node_page); - } -out_page: - unlock_page(node_page); -release_page: - f2fs_put_page(node_page, 0); -} - static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; @@ -1416,6 +1385,37 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, return AOP_WRITEPAGE_ACTIVATE; } +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (__write_node_page(node_page, false, NULL, + &wbc, false, FS_GC_NODE_IO)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { From ddb8e2ae9811a6e1a186e1366db817924231842e Mon Sep 17 00:00:00 2001 From: Fan Li Date: Mon, 30 Oct 2017 15:19:48 +0800 Subject: [PATCH 0483/1212] f2fs: optimize __update_nat_bits Make three modifications for __update_nat_bits: 1. Take the codes of dealing the nat with nid 0 out of the loop Such nat only needs to be dealt with once at beginning. 2. Use " nat_index == 0" instead of " start_nid == 0" to decide if it's the first nat block It's better that we don't assume @start_nid is the first nid of the nat block it's in. 3. Use " if (nat_blk->entries[i].block_addr != NULL_ADDR)" to explicitly confirm the value of block_addr use constant to make sure the codes is right, even if the value of NULL_ADDR changes. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 99c966fcf32d..09707de3c9c5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2424,15 +2424,17 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; - int i; + int i = 0; if (!enabled_nat_bits(sbi, NULL)) return; - for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { - if (start_nid == 0 && i == 0) - valid++; - if (nat_blk->entries[i].block_addr) + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (nat_blk->entries[i].block_addr != NULL_ADDR) valid++; } if (valid == 0) { From 26dfec49b25a3a895181b5f76bc8c762924f6197 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 Oct 2017 21:03:06 -0700 Subject: [PATCH 0484/1212] f2fs: add quota_ino feature infra This patch adds quota_ino feature infra to be used for quota files. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/sysfs.c | 7 +++++++ include/linux/f2fs_fs.h | 6 +++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d8fcdead2aa5..ff5cd87e745f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -123,6 +123,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -3151,6 +3152,11 @@ static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); } +static inline int f2fs_sb_has_quota_ino(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f0fdc89ce82f..9835348b6e5d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -110,6 +110,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_flexible_inline_xattr(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "quota_ino"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -227,6 +230,7 @@ enum feat_id { FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, FEAT_FLEXIBLE_INLINE_XATTR, + FEAT_QUOTA_INO, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -240,6 +244,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: case FEAT_FLEXIBLE_INLINE_XATTR: + case FEAT_QUOTA_INO: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -314,6 +319,7 @@ F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -364,6 +370,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), NULL, }; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d79ca96045e4..3e15df3cebcb 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,9 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_QUOTA_INO 3 +#define F2FS_MAX_QUOTAS 3 + #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ #define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ @@ -108,7 +111,8 @@ struct f2fs_super_block { __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ struct f2fs_device devs[MAX_DEVICES]; /* device list */ - __u8 reserved[327]; /* valid reserved region */ + __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ + 
__u8 reserved[315]; /* valid reserved region */ } __packed; /* From 82750d346ab7d09c9ffc2d9ecce84420422bd6fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Oct 2017 09:14:28 -0700 Subject: [PATCH 0485/1212] f2fs: support quota sys files This patch supports hidden quota files in the system, which will be used for Android. It requires up-to-date f2fs-tools later than v1.9.0. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 9 ++- fs/f2fs/f2fs.h | 10 ++- fs/f2fs/recovery.c | 8 ++- fs/f2fs/super.c | 149 ++++++++++++++++++++++++++++++++++++---- include/linux/f2fs_fs.h | 1 - 5 files changed, 158 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 48f9366240a2..a69795e046bb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -616,6 +616,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; @@ -628,8 +631,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -657,7 +661,8 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff5cd87e745f..72d5ea456250 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1445,6 +1445,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) 
return le64_to_cpu(cp->checkpoint_ver); } +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); @@ -2406,6 +2413,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) return ret; } +#define sb_rdonly f2fs_readonly static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -2607,7 +2615,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9626758bc762..92c57ace1939 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -594,6 +594,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (s_flags & MS_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); @@ -604,7 +607,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -665,7 +668,8 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, 
bool check_only) out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c87c75ed352..e304ce603c5d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -213,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } + if (f2fs_sb_has_quota_ino(sb)) { + f2fs_msg(sb, KERN_INFO, + "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + qname = match_strdup(args); if (!qname) { f2fs_msg(sb, KERN_ERR, @@ -291,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } } + + if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_INFO, + "QUOTA feature is enabled, so ignore jquota_fmt"); + sbi->s_jquota_fmt = 0; + } + if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -1; + } return 0; } #endif @@ -1173,6 +1191,9 @@ static void default_options(struct f2fs_sb_info *sbi) #endif } +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1239,6 +1260,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; +#ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) @@ -1246,9 +1268,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } else { /* dquot_resume needs RW */ sb->s_flags &= ~MS_RDONLY; - dquot_resume(sb, -1); + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if 
(f2fs_sb_has_quota_ino(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } } - +#endif /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1455,19 +1483,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) sbi->s_jquota_fmt, type); } -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) { - int i, ret; + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } for (i = 0; i < MAXQUOTAS; i++) { if (sbi->s_qf_names[i]) { - ret = f2fs_quota_on_mount(sbi, i); - if (ret < 0) - f2fs_msg(sbi->sb, KERN_ERR, - "Cannot turn on journaled " - "quota: error %d", ret); + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quotas: %d on %d", err, i); } } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(sb)); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_msg(sb, KERN_ERR, + "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(F2FS_SB(sb), USRQUOTA), + test_opt(F2FS_SB(sb), GRPQUOTA), + test_opt(F2FS_SB(sb), PRJQUOTA), + }; + 
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; + } + } + } + return 0; } static int f2fs_quota_sync(struct super_block *sb, int type) @@ -1538,7 +1638,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); - if (err) + if (err || f2fs_sb_has_quota_ino(sb)) goto out_put; inode_lock(inode); @@ -2372,7 +2472,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - sb->s_qcop = &f2fs_quotactl_ops; + if (f2fs_sb_has_quota_ino(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif @@ -2543,10 +2646,24 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_root_inode; +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. 
+ */ + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", err); + goto free_sysfs; + } + } +#endif /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) - goto free_sysfs; + goto free_meta; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2580,7 +2697,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_sysfs; + goto free_meta; } } skip_recovery: @@ -2614,6 +2731,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return 0; free_meta: +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by recover_orphan_inodes() @@ -2622,7 +2743,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * falls into an infinite loop in sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); +#ifdef CONFIG_QUOTA free_sysfs: +#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3e15df3cebcb..fef1caeddf54 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,7 +36,6 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) -#define F2FS_QUOTA_INO 3 #define F2FS_MAX_QUOTAS 3 #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ From c5470498e59be4c3d9ebc9d7ee396dd8e6c6b1ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Oct 2017 17:49:53 +0800 Subject: [PATCH 0486/1212] f2fs: use rw_semaphore to protect SIT cache There are some cases user didn't update SIT cache under this lock, so let's use rw_semaphore instead of mutex to enhance concurrently accessing. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 ++++++------ fs/f2fs/segment.c | 34 +++++++++++++++++++--------------- fs/f2fs/segment.h | 2 +- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 359e7b5590f6..297c204ea221 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -456,10 +456,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, struct seg_entry *sentry; int ret; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return ret; } @@ -893,10 +893,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, struct sit_info *sit_i = SIT_I(sbi); int ret; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); return ret; } @@ -944,8 +944,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* * this is to avoid deadlock: * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f59b00aa502b..7dfd4580380e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1984,14 +1984,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; /* add it into sit main buffer */ - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } bool 
is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) @@ -2004,7 +2004,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return true; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); @@ -2013,7 +2013,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return is_cp; } @@ -2409,12 +2409,16 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); locate_dirty_segment(sbi, old_segno); } + + up_write(&SIT_I(sbi)->sentry_lock); } static const struct segment_allocation default_salloc_ops = { @@ -2426,14 +2430,14 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u64 trim_start = cpc->trim_start; bool has_candidate = false; - mutex_lock(&SIT_I(sbi)->sentry_lock); + down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } - mutex_unlock(&SIT_I(sbi)->sentry_lock); + up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; @@ -2593,7 +2597,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -2629,7 +2633,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); @@ -2787,7 +2791,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; @@ -2819,7 +2823,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg->next_blkoff = old_blkoff; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } @@ -3274,7 +3278,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool to_journal = true; struct seg_entry *se; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; @@ -3368,7 +3372,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) cpc->trim_start = trim_start; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } @@ -3461,7 +3465,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; - mutex_init(&sit_i->sentry_lock); + init_rwsem(&sit_i->sentry_lock); return 0; } @@ -3702,7 +3706,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); sit_i->min_mtime = LLONG_MAX; @@ -3719,7 +3723,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi); - mutex_unlock(&sit_i->sentry_lock); + 
up_write(&sit_i->sentry_lock); } int build_segment_manager(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9342b973da65..4f19eb45eada 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -231,7 +231,7 @@ struct sit_info { unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ - struct mutex sentry_lock; /* to protect SIT cache */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ From 42c7c71824fc026f8d0ed1c2261680752ba7ffa4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Oct 2017 17:49:54 +0800 Subject: [PATCH 0487/1212] f2fs: check curseg space before foreground GC When we are close to triggering foreground GC, if there are only a few dirty metas, we can log these dirty metas in the space left in the opened segments instead of triggering foreground GC. With this patch, the total count of foreground GCs triggered by test/generic/* of the fstests suite is reduced from 254 to 184. So let's do the check before foreground GC anyway. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 4f19eb45eada..5264b6ed120c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -497,6 +497,33 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +{ + unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; + int i; + + /* check current node segment */ + for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + segno = CURSEG_I(sbi, i)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + if (node_blocks > left_blocks) + return false; + } + + /* check current data segment */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (dent_blocks > left_blocks) + return false; + return true; +} + static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -507,6 +534,9 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; + if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && + has_curseg_enough_space(sbi)) + return false; return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + needed); From 8b92814117d5b040c30c4978b5489dcac166a8aa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Oct 2017 11:11:56 -0400 Subject: [PATCH 0488/1212] f2fs: don't bother with inode->i_version f2fs does not set the SB_I_VERSION flag, so the i_version will never be incremented on write. 
It was recently changed to increment the i_version on a quota write, which isn't necessary here. Signed-off-by: Jeff Layton Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e304ce603c5d..76e2f1518224 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -664,7 +664,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); /* Initialize f2fs-specific inode info */ - fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; @@ -1461,7 +1460,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, if (len == towrite) return err; - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; From 55c7b9595bb93d69f8b099cd9915ae19cefe53a0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Nov 2017 20:41:01 +0800 Subject: [PATCH 0489/1212] f2fs: remove unneeded semicolon Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a69795e046bb..d6c02bb8fcf8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1017,7 +1017,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) update_inode_page(inode); iput(inode); } - }; + } return 0; } From 44889e487981b1aa258399696a35f1a7be96ea9f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Nov 2017 20:41:02 +0800 Subject: [PATCH 0490/1212] f2fs: remove dead code in update_meta_page After commit a468f0ef516f ("f2fs: use crc and cp version to determine roll-forward recovery"), last caller of update_meta_page passing @src with NULL is gone, so remove related dead code there. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7dfd4580380e..9538e1ac652d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2071,12 +2071,8 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = grab_meta_page(sbi, blk_addr); - void *dst = page_address(page); - if (src) - memcpy(dst, src, PAGE_SIZE); - else - memset(dst, 0, PAGE_SIZE); + memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } From 3e3b40557525c0bdb32f4b8d19f02b660245bc27 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Nov 2017 20:41:03 +0800 Subject: [PATCH 0491/1212] f2fs: fix summary info corruption Sometimes, after running generic/270 of fstest, fsck reports summary info and actual position of block address in direct node becoming inconsistent. The root cause is race in between __f2fs_replace_block and change_curseg as below: Thread A Thread B - __clone_blkaddrs - f2fs_replace_block - __f2fs_replace_block - segnoA = GET_SEGNO(sbi, blkaddrA); - type = se->type:=CURSEG_HOT_DATA - if (!IS_CURSEG(sbi, segnoA)) type = CURSEG_WARM_DATA - allocate_data_block - allocate_segment - get_ssr_segment - change_curseg(segnoA, CURSEG_HOT_DATA) - change_curseg(segnoA, CURSEG_WARM_DATA) - reset_curseg - __set_sit_entry_type - change se->type from CURSEG_HOT_DATA to CURSEG_WARM_DATA So finally, hot curseg locates in segnoA, but type of segnoA becomes CURSEG_WARM_DATA. Then if we invoke __f2fs_replace_block(blkaddrB, blkaddrA, true, false), as blkaddrA locates in segnoA, so we will move warm type curseg to segnoA, then change its summary cache and writeback it to summary block. 
But segnoA is used by hot type curseg too, once it moves or persist, it will cover summary block content with inner old summary cache, result in inconsistent status. This patch tries to fix this issue by introduce global curseg lock to avoid race in between __f2fs_replace_block and change_curseg. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 28 +++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 72d5ea456250..b6b382888a94 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -884,6 +884,8 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ + struct rw_semaphore curseg_lock; /* for preventing curseg change */ + block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9538e1ac652d..734c6a880633 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2592,6 +2592,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + down_read(&SM_I(sbi)->curseg_lock); + mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); @@ -2649,6 +2651,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); } static void update_device_state(struct f2fs_io_info *fio) @@ -2756,6 +2760,18 @@ int rewrite_data_page(struct f2fs_io_info *fio) return err; } +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; +} + void 
__f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) @@ -2771,6 +2787,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; + down_write(&SM_I(sbi)->curseg_lock); + if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { @@ -2780,8 +2798,13 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, type = CURSEG_WARM_DATA; } } else { - if (!IS_CURSEG(sbi, segno)) + if (IS_CURSEG(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { type = CURSEG_WARM_DATA; + } } curseg = CURSEG_I(sbi, type); @@ -2821,6 +2844,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); + up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -3758,6 +3782,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); + init_rwsem(&sm_info->curseg_lock); + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) From 4423778adf0e777147f9c0252f6a4f42cbb91256 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Thu, 2 Nov 2017 11:02:52 +0800 Subject: [PATCH 0492/1212] f2fs: save a multiplication for last_nid calculation Use a slightly easier way to calculate last_nid. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 09707de3c9c5..930bdb90faac 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2629,7 +2629,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) __set_bit_le(i, nm_i->nat_block_bitmap); nid = i * NAT_ENTRY_PER_BLOCK; - last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) From 3c8f767e13741c5174909e39f238655c82be1c20 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 3 Nov 2017 10:21:05 +0800 Subject: [PATCH 0493/1212] f2fs: avoid race in between GC and block exchange During block exchange in {insert,collapse,move}_range, page-block mapping is unstable due to mapping moving or recovery, so there should be no concurrent cache read operation rely on such mapping, nor cache write operation to mess up block exchange. So this patch let background GC be aware of that. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 26 +++++++++++++++++++++----- fs/f2fs/gc.c | 7 +++++++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 19cdf9f5261b..62f23f82b971 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1174,11 +1174,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - goto out; + goto out_unlock; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1190,7 +1193,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); - +out_unlock: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1373,6 +1377,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1400,6 +1407,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); + + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -2254,9 +2263,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); + down_write(&F2FS_I(src)->dio_rwsem[WRITE]); if (src != dst) { - if (!inode_trylock(dst)) { - ret = -EBUSY; + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + inode_unlock(dst); goto out; } } @@ -2316,9 +2329,12 @@ 
static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } f2fs_unlock_op(sbi); out_unlock: - if (src != dst) + if (src != dst) { + up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); inode_unlock(dst); + } out: + up_write(&F2FS_I(src)->dio_rwsem[WRITE]); inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 297c204ea221..be9fd616736b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -832,10 +832,17 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } + if (!down_write_trylock( + &F2FS_I(inode)->dio_rwsem[WRITE])) { + iput(inode); + continue; + } + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; From 5d4b6efcfd09ce00a2ef238ee333cdabcf1d87c6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Nov 2017 21:53:30 +0800 Subject: [PATCH 0494/1212] f2fs: keep isize once block is reserved cross EOF Without FADVISE_KEEP_SIZE_BIT, we will try to recover file size according to last non-hole block, so in fallocate(), we must set FADVISE_KEEP_SIZE_BIT flag once we have preallocated block cross EOF, instead of when all preallocation is success. Otherwise, file size will be incorrect due to lack of this flag. Simple testcase to reproduce this: 1. echo 2 > /sys/fs/f2fs//inject_type 2. echo 10 > /sys/fs/f2fs//inject_rate 3. run tests/generic/392 4. disable fault injection 5. 
do remount Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 62f23f82b971..0ebf08f00b8f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1458,8 +1458,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } return err; } @@ -1506,8 +1510,6 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); - if (mode & FALLOC_FL_KEEP_SIZE) - file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } From 0186182c0c4d208a43d0c09bb04027b9e7e8f15a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Nov 2017 22:51:45 +0800 Subject: [PATCH 0495/1212] f2fs: trace checkpoint reason in fsync() This patch slightly changes need_do_checkpoint to return the detail info that indicates why we need do checkpoint, then caller could print it with trace message. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 ++++++++++++ fs/f2fs/file.c | 34 ++++++++++++++++++---------------- include/trace/events/f2fs.h | 24 ++++++++++++++++++------ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b6b382888a94..31edffdd51d4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -977,6 +977,18 @@ enum need_lock_type { LOCK_RETRY, }; +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, +}; + enum iostat_type { APP_DIRECT_IO, /* app direct IOs */ APP_BUFFERED_IO, /* app buffered IOs */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0ebf08f00b8f..3de13816d2ac 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -147,27 +147,29 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) return 1; } -static inline bool need_do_checkpoint(struct inode *inode) +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool need_cp = false; + enum cp_reason_type cp_reason = CP_NO_NEEDED; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) - need_cp = true; + cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) - need_cp = true; + cp_reason = CP_WRONG_PINO; else if (!space_for_roll_forward(sbi)) - need_cp = true; + cp_reason = CP_NO_SPC_ROLL; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; + cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) - need_cp = true; + cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) - need_cp = true; + cp_reason = CP_SPEC_LOG_NUM; - return need_cp; + return cp_reason; } static bool need_inode_page_update(struct 
f2fs_sb_info *sbi, nid_t ino) @@ -202,7 +204,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; - bool need_cp = false; + enum cp_reason_type cp_reason = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, @@ -221,7 +223,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, clear_inode_flag(inode, FI_NEED_IPU); if (ret) { - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } @@ -252,10 +254,10 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * sudden-power-off. */ down_read(&F2FS_I(inode)->i_sem); - need_cp = need_do_checkpoint(inode); + cp_reason = need_do_checkpoint(inode); up_read(&F2FS_I(inode)->i_sem); - if (need_cp) { + if (cp_reason) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); @@ -312,7 +314,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } f2fs_update_time(sbi, REQ_TIME); out: - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c9be882c2718..589df6f73789 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -128,6 +128,18 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_DISCARD, "Discard" }, \ { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" }) +#define show_fsync_cpreason(type) \ + __print_symbolic(type, \ + { CP_NO_NEEDED, "no needed" }, \ + { CP_NON_REGULAR, "non regular" }, \ + { CP_HARDLINK, "hardlink" }, \ + { CP_SB_NEED_CP, "sb needs cp" }, \ + { CP_WRONG_PINO, "wrong pino" }, \ + { CP_NO_SPC_ROLL, "no space roll forward" }, \ + { CP_NODE_NEED_CP, "node needs cp" }, \ + { CP_FASTBOOT_MODE, "fastboot 
mode" }, \ + { CP_SPEC_LOG_NUM, "log type is 2" }) + struct victim_sel_policy; struct f2fs_map_blocks; @@ -202,14 +214,14 @@ DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter, TRACE_EVENT(f2fs_sync_file_exit, - TP_PROTO(struct inode *inode, int need_cp, int datasync, int ret), + TP_PROTO(struct inode *inode, int cp_reason, int datasync, int ret), - TP_ARGS(inode, need_cp, datasync, ret), + TP_ARGS(inode, cp_reason, datasync, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(int, need_cp) + __field(int, cp_reason) __field(int, datasync) __field(int, ret) ), @@ -217,15 +229,15 @@ TRACE_EVENT(f2fs_sync_file_exit, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->need_cp = need_cp; + __entry->cp_reason = cp_reason; __entry->datasync = datasync; __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, checkpoint is %s, " + TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, " "datasync = %d, ret = %d", show_dev_ino(__entry), - __entry->need_cp ? "needed" : "not needed", + show_fsync_cpreason(__entry->cp_reason), __entry->datasync, __entry->ret) ); From 460688b59e8bc67d25340c430c099f1c8ebcdb4d Mon Sep 17 00:00:00 2001 From: Fan Li Date: Tue, 7 Nov 2017 11:04:33 +0800 Subject: [PATCH 0496/1212] f2fs: keep scanning until enough free nids are acquired In current version, after scan_free_nid_bits, the scan is over if nid_cnt[FREE_NID] != 0. In most cases, there are still free nids in the free list during the scan, and scan_free_nid_bits usually can't increase nid_cnt[FREE_NID]. It causes that __build_free_nids is called many times without solving the shortage of the free nids. This patch fixes that. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 930bdb90faac..c75c1ac06f3a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2018,7 +2018,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); - if (nm_i->nid_cnt[FREE_NID]) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; } From ca28e9670e807900f4ad9a447ffa50b0b4cbff5f Mon Sep 17 00:00:00 2001 From: Fan Li Date: Tue, 7 Nov 2017 19:14:24 +0800 Subject: [PATCH 0497/1212] f2fs: optimize the way of traversing free_nid_bitmap We call scan_free_nid_bits only when there isn't many free nids left, it means that marked bits in free_nid_bitmap are supposed to be few, use find_next_bit_le is more efficient in such case. According to my tests, use find_next_bit_le instead of test_bit_le will cut down the traversal time to one third of its original. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c75c1ac06f3a..ffaa695224f7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1958,6 +1958,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; unsigned int i, idx; + nid_t nid; down_read(&nm_i->nat_tree_lock); @@ -1967,10 +1968,10 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) if (!nm_i->free_nid_count[i]) continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { - nid_t nid; - - if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) - continue; + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) + break; nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); @@ -1983,7 +1984,6 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) down_read(&curseg->journal_rwsem); for (i = 0; i < nats_in_cursum(journal); i++) { block_t addr; - nid_t nid; addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); nid = le32_to_cpu(nid_in_journal(journal, i)); From ac9819160586ff12691558c3a3b07554069a8024 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Nov 2017 17:47:36 +0800 Subject: [PATCH 0498/1212] f2fs: introduce scan_curseg_cache for cleanup Commit 4ac912427c42 ("f2fs: introduce free nid bitmap") copied codes from __build_free_nids() into scan_free_nid_bits(), they are redundant, introduce one common function scan_curseg_cache for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ffaa695224f7..62e597b08e09 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1952,11 +1952,30 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } +static void scan_curseg_cache(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + static void scan_free_nid_bits(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_journal *journal = curseg->journal; unsigned int i, idx; nid_t nid; @@ -1981,26 +2000,14 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) } } out: - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); } static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; @@ -2046,18 +2053,8 @@ static void 
__build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), From 47af6c72d9440c90674d9b79da13ce8922491d24 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 9 Nov 2017 14:51:27 +0900 Subject: [PATCH 0499/1212] f2fs: apply write hints to select the type of segments for buffered write Write hints help F2FS to determine which type of segments would be selected for buffered write. This patch implements the mapping from write hints to segment types as shown below. hints segment type ----- ------------ WRITE_LIFE_SHORT CURSEG_HOT_DATA WRITE_LIFE_EXTREME CURSEG_COLD_DATA others CURSEG_WARM_DATA The F2FS policy for hot/cold separation has precedence over these hints. And hints are not applied in in-place update. 
Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 734c6a880633..94939a5a96c8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2514,6 +2514,20 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } +#if 0 +int rw_hint_to_seg_type(enum rw_hint hint) +{ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + return CURSEG_WARM_DATA; + } +} +#endif + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2548,6 +2562,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; + /* rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; } else { if (IS_DNODE(fio->page)) From baf9275a4bbdf42fcc443ba3ba90482ee9995665 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 10 Nov 2017 09:30:42 +0800 Subject: [PATCH 0500/1212] f2fs: avoid opened loop codes in __add_ino_entry We will keep __add_ino_entry success all the time, for ENOMEM failure case, we have already handled it by using __GFP_NOFAIL flag, so we don't have to use additional opened loop codes here, remove them. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d6c02bb8fcf8..2eb778174a9b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -408,18 +408,16 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, struct ino_entry *e, *tmp; tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); -retry: + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { e = tmp; - if (radix_tree_insert(&im->ino_root, ino, e)) { - spin_unlock(&im->ino_lock); - radix_tree_preload_end(); - goto retry; - } + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + memset(e, 0, sizeof(struct ino_entry)); e->ino = ino; From 48c72b4c8c5016521b3c670d003acb1dc664c3ac Mon Sep 17 00:00:00 2001 From: LiFan Date: Fri, 10 Nov 2017 15:41:42 +0800 Subject: [PATCH 0501/1212] f2fs: validate before set/clear free nat bitmap In flush_nat_entries, all dirty nats will be flushed and if their new address isn't NULL_ADDR, their bitmaps will be updated, the free_nid_count of the bitmaps will be increased regardless of whether the nats have already been occupied before. This could lead to wrong free_nid_count. So this patch checks the status of the bits before actually setting/clearing them. 
Fixes: 586d1492f301 ("f2fs: skip scanning free nid bitmap of full NAT blocks") Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 62e597b08e09..7e3ee2c5e497 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1909,15 +1909,18 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) return; - if (set) + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - else - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - - if (set) nm_i->free_nid_count[nat_ofs]++; - else if (!build) - nm_i->free_nid_count[nat_ofs]--; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } } static void scan_nat_page(struct f2fs_sb_info *sbi, From c4cd2efe835b9b3dc0d9ca0807f46b603f6e3532 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 10 Nov 2017 13:36:51 -0800 Subject: [PATCH 0502/1212] f2fs: separate nat entry mem alloc from nat_tree_lock This patch splits memory allocation part in nat_entry to avoid lock contention. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 98 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7e3ee2c5e497..964c99655942 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -138,6 +138,44 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } +static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +{ + struct nat_entry *new; + + if (no_fail) + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + else + new = kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} + +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); +} + +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) +{ + if (no_fail) + f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; + + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); + list_add_tail(&ne->list, &nm_i->nat_entries); + nm_i->nat_cnt++; + return ne; +} + static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -154,7 +192,7 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) list_del(&e->list); radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; - kmem_cache_free(nat_entry_slab, e); + __free_nat_entry(e); } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -250,49 +288,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, 
- bool no_fail) -{ - struct nat_entry *new; - - if (no_fail) { - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); - } else { - new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - if (!new) - return NULL; - if (radix_tree_insert(&nm_i->nat_root, nid, new)) { - kmem_cache_free(nat_entry_slab, new); - return NULL; - } - } - - memset(new, 0, sizeof(struct nat_entry)); - nat_set_nid(new, nid); - nat_reset_flag(new); - list_add_tail(&new->list, &nm_i->nat_entries); - nm_i->nat_cnt++; - return new; -} - +/* must be locked by nat_tree_lock */ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct nat_entry *e; + struct nat_entry *new, *e; + new = __alloc_nat_entry(nid, false); + if (!new) + return; + + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (!e) { - e = grab_nat_entry(nm_i, nid, false); - if (e) - node_info_from_raw_nat(&e->ni, ne); - } else { + if (!e) + e = __init_nat_entry(nm_i, new, ne, false); + else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - } + up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -300,11 +318,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; + struct nat_entry *new = __alloc_nat_entry(ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid, true); + e = __init_nat_entry(nm_i, new, NULL, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -316,6 +335,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, 
copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); /* sanity check */ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); @@ -424,9 +446,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) f2fs_put_page(page, 1); cache: /* cache nat entry */ - down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - up_write(&nm_i->nat_tree_lock); } /* @@ -2377,8 +2397,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid, true); - node_info_from_raw_nat(&ne->ni, &raw_ne); + ne = __alloc_nat_entry(nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true); } /* From e6cfc5de2d057d457d53084acb52c9383e62f44a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Nov 2017 17:46:38 -0800 Subject: [PATCH 0503/1212] f2fs: expose quota information in debugfs This patch shows # of dirty pages and # of hidden quota files. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 +++++++++++ fs/f2fs/f2fs.h | 10 ++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f7eec506ceea..ecada8425268 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,9 +45,18 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + + si->nquota_files = 0; + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + si->nquota_files++; + } + } si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); @@ -369,6 +378,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 31edffdd51d4..7569347fd453 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -929,6 +929,7 @@ struct f2fs_sm_info { enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -1703,6 +1704,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode) atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1719,6 +1722,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -2853,9 +2858,10 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; int inmem_pages; - unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; From 9262922510220084179c7fec823dd05526bdf77f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Nov 2017 17:32:39 +0800 Subject: [PATCH 0504/1212] f2fs: fix to clear FI_NO_PREALLOC We need to clear FI_NO_PREALLOC flag in error path of f2fs_file_write_iter, otherwise we will lose the chance to preallocate blocks in latter write() at one time. 
Fixes: dc91de78e5e1 ("f2fs: do not preallocate blocks which has wrong buffer") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3de13816d2ac..52d29785154a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2529,6 +2529,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) err = f2fs_preallocate_blocks(iocb, from); if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); inode_unlock(inode); return err; } From c394842e26e555a5d26d476e8fb2014ce7fbae57 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Nov 2017 17:32:40 +0800 Subject: [PATCH 0505/1212] f2fs: inject fault in inc_valid_node_count This patch adds missing fault injection in inc_valid_node_count. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7569347fd453..fc9c00ae7159 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1831,6 +1831,13 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + goto enospc; + } +#endif + spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; From 56a07b07051015d5fab339561103c4cc346c9685 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Nov 2017 19:28:42 +0800 Subject: [PATCH 0506/1212] f2fs: deny accessing encryption policy if encryption is off This patch adds missing feature check in encryption ioctl interface. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 52d29785154a..bfff53f658e1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1905,6 +1905,9 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); + if (!f2fs_sb_has_crypto(inode->i_sb)) + return -EOPNOTSUPP; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); @@ -1912,6 +1915,8 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { + if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } From a53dc7e00559b3302b38deeffab735fdfe6ec20c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 22 Jun 2017 12:14:40 -0700 Subject: [PATCH 0507/1212] fscrypt: make ->dummy_context() return bool This makes it consistent with ->is_encrypted(), ->empty_dir(), and fscrypt_dummy_context_enabled(). 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 4022c61f7e9b..e3e1208e0f54 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -77,7 +77,7 @@ struct fscrypt_operations { const char *key_prefix; int (*get_context)(struct inode *, void *, size_t); int (*set_context)(struct inode *, const void *, size_t, void *); - int (*dummy_context)(struct inode *); + bool (*dummy_context)(struct inode *); bool (*is_encrypted)(struct inode *); bool (*empty_dir)(struct inode *); unsigned (*max_namelen)(struct inode *); From bc4a61c60bea8d5e7468885ef7b7a41ba16b1b96 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:46:18 -0700 Subject: [PATCH 0508/1212] fscrypt: fix dereference of NULL user_key_payload When an fscrypt-encrypted file is opened, we request the file's master key from the keyrings service as a logon key, then access its payload. However, a revoked key has a NULL payload, and we failed to check for this. request_key() *does* skip revoked keys, but there is still a window where the key can be revoked before we acquire its semaphore. Fix it by checking for a NULL payload, treating it like a key which was already revoked at the time it was requested. 
Fixes: 88bd6ccdcdd6 ("ext4 crypto: add encryption key management facilities") Reviewed-by: James Morris Cc: [v4.1+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- fs/crypto/keyinfo.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 66e0728e9bbe..169fefb62940 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -109,6 +109,11 @@ static int validate_user_key(struct fscrypt_info *crypt_info, goto out; } ukp = user_key_payload(keyring_key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + res = -EKEYREVOKED; + goto out; + } if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; goto out; From ff0a3dbc9392233e967b54534459c68d91e7963e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Oct 2017 12:15:34 -0700 Subject: [PATCH 0509/1212] fscrypt: clean up include file mess Filesystems have to include different header files based on whether they are compiled with encryption support or not. That's nasty and messy. Instead, rationalise the headers so we have a single include fscrypt.h and let it decide what internal implementation to include based on the __FS_HAS_ENCRYPTION define. Filesystems set __FS_HAS_ENCRYPTION to 1 before including linux/fscrypt.h if they are built with encryption support. Otherwise, they must set __FS_HAS_ENCRYPTION to 0. Add guards to prevent fscrypt_supp.h and fscrypt_notsupp.h from being directly included by filesystems. 
Signed-off-by: Dave Chinner [EB: use 1 and 0 rather than defined/undefined] Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 3 +- fs/ext4/ext4.h | 3 ++ fs/f2fs/f2fs.h | 8 ++-- include/linux/{fscrypt_common.h => fscrypt.h} | 41 +++++++++++++------ include/linux/fscrypt_notsupp.h | 7 +++- include/linux/fscrypt_supp.h | 7 ++-- 6 files changed, 45 insertions(+), 24 deletions(-) rename include/linux/{fscrypt_common.h => fscrypt.h} (79%) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 79d79755d79b..ff97988fe6e9 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -11,7 +11,8 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#include +#define __FS_HAS_ENCRYPTION 1 +#include #include /* Encryption parameters */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7e921d207fb..bea7d7febdab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -37,6 +37,9 @@ #include #endif +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) +#include + /* * The fourth extended filesystem constants/structures */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc9c00ae7159..b036ea741a03 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -23,14 +23,12 @@ #include #include #include -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#include -#else -#include -#endif #include #include +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) +#include + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt.h similarity index 79% rename from include/linux/fscrypt_common.h rename to include/linux/fscrypt.h index e3e1208e0f54..58663327f692 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt.h @@ -1,14 +1,17 @@ /* - * fscrypt_common.h: common declarations for per-file encryption + * fscrypt.h: declarations for per-file encryption + * + * Filesystems that implement 
per-file encryption include this header + * file with the __FS_HAS_ENCRYPTION set according to whether that filesystem + * is being built with encryption support or not. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ - -#ifndef _LINUX_FSCRYPT_COMMON_H -#define _LINUX_FSCRYPT_COMMON_H +#ifndef _LINUX_FSCRYPT_H +#define _LINUX_FSCRYPT_H #include #include @@ -116,23 +119,35 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) return false; } +#if __FS_HAS_ENCRYPTION + static inline struct page *fscrypt_control_page(struct page *page) { -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) return ((struct fscrypt_ctx *)page_private(page))->w.control_page; -#else +} + +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return (inode->i_crypt_info != NULL); +} + +#include + +#else /* !__FS_HAS_ENCRYPTION */ + +static inline struct page *fscrypt_control_page(struct page *page) +{ WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); -#endif } -static inline int fscrypt_has_encryption_key(const struct inode *inode) +static inline bool fscrypt_has_encryption_key(const struct inode *inode) { -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return (inode->i_crypt_info != NULL); -#else return 0; -#endif } -#endif /* _LINUX_FSCRYPT_COMMON_H */ +#include +#endif /* __FS_HAS_ENCRYPTION */ + + +#endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index ec406aed2f2f..2d0b6960831e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -3,13 +3,16 @@ * * This stubs out the fscrypt functions for filesystems configured without * encryption support. + * + * Do not include this file directly. Use fscrypt.h instead! */ +#ifndef _LINUX_FSCRYPT_H +#error "Incorrect include of linux/fscrypt_notsupp.h!" 
+#endif #ifndef _LINUX_FSCRYPT_NOTSUPP_H #define _LINUX_FSCRYPT_NOTSUPP_H -#include <linux/fscrypt_common.h> - /* crypto.c */ static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 32e2fcf13b01..5a90e5ef4687 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -1,14 +1,15 @@ /* * fscrypt_supp.h * - * This is included by filesystems configured with encryption support. + * Do not include this file directly. Use fscrypt.h instead! */ +#ifndef _LINUX_FSCRYPT_H +#error "Incorrect include of linux/fscrypt_supp.h!" +#endif #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H -#include <linux/fscrypt_common.h> - /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); From a4781dd1f17554560631993375a446c04c7d6c78 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:35 -0700 Subject: [PATCH 0510/1212] fs, fscrypt: add an S_ENCRYPTED inode flag Introduce a flag S_ENCRYPTED which can be set in ->i_flags to indicate that the inode is encrypted using the fscrypt (fs/crypto/) mechanism. Checking this flag will give the same information that inode->i_sb->s_cop->is_encrypted(inode) currently does, but will be more efficient. This will be useful for adding higher-level helper functions for filesystems to use. For example we'll be able to replace this: if (ext4_encrypted_inode(inode)) { ret = fscrypt_get_encryption_info(inode); if (ret) return ret; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } with this: ret = fscrypt_require_key(inode); if (ret) return ret; ... since we'll be able to retain the fast path for unencrypted files as a single flag check, using an inline function. This wasn't possible before because we'd have had to frequently call through the ->i_sb->s_cop->is_encrypted function pointer, even when the encryption support was disabled or not being used. 
Note: we don't define S_ENCRYPTED to 0 if CONFIG_FS_ENCRYPTION is disabled because we want to continue to return an error if an encrypted file is accessed without encryption support, rather than pretending that it is unencrypted. Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 5 ++++- include/linux/fs.h | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28702932a908..df30d04f6760 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4103,8 +4103,11 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; if (test_opt(inode->i_sb, DAX)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| + S_ENCRYPTED); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b036ea741a03..dc4a95e848af 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3147,6 +3147,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); + inode->i_flags |= S_ENCRYPTED; #endif } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9684d53563f1..b4c4f2b25304 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -43,8 +43,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (f2fs_encrypted_inode(inode)) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/include/linux/fs.h b/include/linux/fs.h index a88271902ff2..933978eb92fb 100644 --- 
a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1778,6 +1778,7 @@ struct super_operations { #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif +#define S_ENCRYPTED 16384 /* Encrypted file (using fs/crypto/) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -1816,6 +1817,7 @@ struct super_operations { #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) +#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) From 32c0d3ae9d664766abf4e64f89398dcd92614b35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:36 -0700 Subject: [PATCH 0511/1212] fscrypt: switch from ->is_encrypted() to IS_ENCRYPTED() IS_ENCRYPTED() now gives the same information as i_sb->s_cop->is_encrypted() but is more efficient, since IS_ENCRYPTED() is just a simple flag check. Prepare to remove ->is_encrypted() by switching all callers to IS_ENCRYPTED(). 
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 2 +- fs/crypto/fname.c | 3 +-- fs/crypto/keyinfo.c | 2 +- fs/crypto/policy.c | 6 +++--- include/linux/fscrypt_notsupp.h | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c7835df7e7b8..608f6bbe0f31 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -340,7 +340,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dir = dget_parent(dentry); - if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + if (!IS_ENCRYPTED(d_inode(dir))) { dput(dir); return 0; } diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index ad9f814fdead..2878289b3ed2 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -382,8 +382,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!dir->i_sb->s_cop->is_encrypted(dir) || - fscrypt_is_dot_dotdot(iname)) { + if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 169fefb62940..3ce6ca91ce23 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -273,7 +273,7 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || - inode->i_sb->s_cop->is_encrypted(inode)) + IS_ENCRYPTED(inode)) return res; /* Fake up a context for an unencrypted directory */ memset(&ctx, 0, sizeof(ctx)); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 9914d51dff86..2f2c53f2e136 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,7 +109,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int 
res; - if (!inode->i_sb->s_cop->is_encrypted(inode)) + if (!IS_ENCRYPTED(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -166,11 +166,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 1; /* No restrictions if the parent directory is unencrypted */ - if (!cops->is_encrypted(parent)) + if (!IS_ENCRYPTED(parent)) return 1; /* Encrypted directories must not contain unencrypted files */ - if (!cops->is_encrypted(child)) + if (!IS_ENCRYPTED(child)) return 0; /* diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 2d0b6960831e..7b390e356f7f 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -100,7 +100,7 @@ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - if (dir->i_sb->s_cop->is_encrypted(dir)) + if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(struct fscrypt_name)); From 1034eeec516a7ff036c55e46ff9fe124584fdb82 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:37 -0700 Subject: [PATCH 0512/1212] fscrypt: remove ->is_encrypted() Now that all callers of fscrypt_operations.is_encrypted() have been switched to IS_ENCRYPTED(), remove ->is_encrypted(). 
Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/super.c | 2 -- include/linux/fscrypt.h | 1 - 2 files changed, 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 76e2f1518224..07a5628a6779 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1742,13 +1742,11 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else static const struct fscrypt_operations f2fs_cryptops = { - .is_encrypted = f2fs_encrypted_inode, }; #endif diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 58663327f692..800e0f812f36 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -81,7 +81,6 @@ struct fscrypt_operations { int (*get_context)(struct inode *, void *, size_t); int (*set_context)(struct inode *, const void *, size_t, void *); bool (*dummy_context)(struct inode *); - bool (*is_encrypted)(struct inode *); bool (*empty_dir)(struct inode *); unsigned (*max_namelen)(struct inode *); }; From 272e43502577d08921becbce635d8e0a48c8086d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:38 -0700 Subject: [PATCH 0513/1212] fscrypt: remove unneeded empty fscrypt_operations structs In the case where a filesystem has been configured without encryption support, there is no longer any need to initialize ->s_cop at all, since none of the methods are ever called. 
Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07a5628a6779..187cead7bd37 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1745,9 +1745,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; -#else -static const struct fscrypt_operations f2fs_cryptops = { -}; #endif static struct inode *f2fs_nfs_get_inode(struct super_block *sb, @@ -2476,7 +2473,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) #endif sb->s_op = &f2fs_sops; +#ifdef CONFIG_F2FS_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; +#endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; From 8c815f381cd6224828e63dbfb5435bdd58240ed4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:39 -0700 Subject: [PATCH 0514/1212] fscrypt: new helper function - fscrypt_require_key() Add a helper function which checks if an inode is encrypted, and if so, tries to set up its encryption key. This is a pattern which is duplicated in multiple places in each of ext4, f2fs, and ubifs --- for example, when a regular file is asked to be opened or truncated. 
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 800e0f812f36..b1e3914c3e49 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -148,5 +148,30 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) #include #endif /* __FS_HAS_ENCRYPTION */ +/** + * fscrypt_require_key - require an inode's encryption key + * @inode: the inode we need the key for + * + * If the inode is encrypted, set up its encryption key if not already done. + * Then require that the key be present and return -ENOKEY otherwise. + * + * No locks are needed, and the key will live as long as the struct inode --- so + * it won't go away from under you. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int fscrypt_require_key(struct inode *inode) +{ + if (IS_ENCRYPTED(inode)) { + int err = fscrypt_get_encryption_info(inode); + + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + return 0; +} #endif /* _LINUX_FSCRYPT_H */ From 2b4b4f98dddf0430cb52d9729a51066fe16153b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:40 -0700 Subject: [PATCH 0515/1212] fscrypt: new helper function - fscrypt_file_open() Add a helper function which prepares to open a regular file which may be encrypted. It handles setting up the file's encryption key, then checking that the file's encryption policy matches that of its parent directory (if the parent directory is encrypted). It may be set as the ->open() method or it can be called from another ->open() method. 
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/Makefile | 2 +- fs/crypto/hooks.c | 49 +++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 9 ++++++ include/linux/fscrypt_supp.h | 3 ++ 4 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 fs/crypto/hooks.c diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 9f6607f17b53..cb496989a6b6 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o -fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c new file mode 100644 index 000000000000..069088e91ea9 --- /dev/null +++ b/fs/crypto/hooks.c @@ -0,0 +1,49 @@ +/* + * fs/crypto/hooks.c + * + * Encryption hooks for higher-level filesystem operations. + */ + +#include +#include "fscrypt_private.h" + +/** + * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * Currently, an encrypted regular file can only be opened if its encryption key + * is available; access to the raw encrypted contents is not supported. + * Therefore, we first set up the inode's encryption key (if not already done) + * and return an error if it's unavailable. + * + * We also verify that if the parent directory (from the path via which the file + * is being opened) is encrypted, then the inode being opened uses the same + * encryption policy. This is needed as part of the enforcement that all files + * in an encrypted directory tree use the same encryption policy, as a + * protection against certain types of offline attacks. Note that this check is + * needed even when opening an *unencrypted* file, since it's forbidden to have + * an unencrypted file in an encrypted directory. 
+ * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + */ +int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + int err; + struct dentry *dir; + + err = fscrypt_require_key(inode); + if (err) + return err; + + dir = dget_parent(file_dentry(filp)); + if (IS_ENCRYPTED(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); + err = -EPERM; + } + dput(dir); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_file_open); diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 7b390e356f7f..162da6517ac4 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -177,4 +177,13 @@ static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return -EOPNOTSUPP; } +/* hooks.c */ + +static inline int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + return 0; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 5a90e5ef4687..fd2f6decaee4 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -143,4 +143,7 @@ extern void fscrypt_pullback_bio_page(struct page **, bool); extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, unsigned int); +/* hooks.c */ +extern int fscrypt_file_open(struct inode *inode, struct file *filp); + #endif /* _LINUX_FSCRYPT_SUPP_H */ From 95efafb6239dd82ca0bb3d9e32edaa41da58f54e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:41 -0700 Subject: [PATCH 0516/1212] fscrypt: new helper function - fscrypt_prepare_link() Introduce a helper function which prepares to link an inode into a possibly-encrypted directory. 
It handles setting up the target directory's encryption key, then verifying that the link won't violate the constraint that all files in an encrypted directory tree use the same encryption policy. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 15 +++++++++++++++ include/linux/fscrypt.h | 27 +++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 6 ++++++ include/linux/fscrypt_supp.h | 1 + 4 files changed, 49 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 069088e91ea9..8b90217320dd 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -47,3 +47,18 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); + +int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) +{ + int err; + + err = fscrypt_require_key(dir); + if (err) + return err; + + if (!fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b1e3914c3e49..4a2b0e307711 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -174,4 +174,31 @@ static inline int fscrypt_require_key(struct inode *inode) return 0; } +/** + * fscrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory + * @old_dentry: an existing dentry for the inode being linked + * @dir: the target directory + * @dentry: negative dentry for the target filename + * + * A new link can only be added to an encrypted directory if the directory's + * encryption key is available --- since otherwise we'd have no way to encrypt + * the filename. Therefore, we first set up the directory's encryption key (if + * not already done) and return an error if it's unavailable. + * + * We also verify that the link will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. 
+ * + * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, + * -EPERM if the link would result in an inconsistent encryption policy, or + * another -errno code. + */ +static inline int fscrypt_prepare_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + if (IS_ENCRYPTED(dir)) + return __fscrypt_prepare_link(d_inode(old_dentry), dir); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 162da6517ac4..d7d1039eb6b5 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -186,4 +186,10 @@ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) return 0; } +static inline int __fscrypt_prepare_link(struct inode *inode, + struct inode *dir) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index fd2f6decaee4..80706283da75 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -145,5 +145,6 @@ extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); +extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir); #endif /* _LINUX_FSCRYPT_SUPP_H */ From a31feba5c18ff73cedb4301c5a0f2ffa7624218b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:42 -0700 Subject: [PATCH 0517/1212] fscrypt: new helper function - fscrypt_prepare_rename() Introduce a helper function which prepares to rename a file into a possibly encrypted directory. It handles loading the encryption keys for the source and target directories if needed, and it handles enforcing that if the target directory (and the source directory for a cross-rename) is encrypted, then the file being moved into the directory has the same encryption policy as its containing directory. 
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 30 ++++++++++++++++++++++++++++++ include/linux/fscrypt.h | 33 +++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 9 +++++++++ include/linux/fscrypt_supp.h | 5 +++++ 4 files changed, 77 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 8b90217320dd..822cb78f9b45 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -62,3 +62,33 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); + +int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = fscrypt_require_key(old_dir); + if (err) + return err; + + err = fscrypt_require_key(new_dir); + if (err) + return err; + + if (old_dir != new_dir) { + if (IS_ENCRYPTED(new_dir) && + !fscrypt_has_permitted_context(new_dir, + d_inode(old_dentry))) + return -EPERM; + + if ((flags & RENAME_EXCHANGE) && + IS_ENCRYPTED(old_dir) && + !fscrypt_has_permitted_context(old_dir, + d_inode(new_dentry))) + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 4a2b0e307711..d331050e93f4 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -201,4 +201,37 @@ static inline int fscrypt_prepare_link(struct dentry *old_dentry, return 0; } +/** + * fscrypt_prepare_rename - prepare for a rename between possibly-encrypted directories + * @old_dir: source directory + * @old_dentry: dentry for source file + * @new_dir: target directory + * @new_dentry: dentry for target location (may be negative unless exchanging) + * @flags: rename flags (we care at least about %RENAME_EXCHANGE) + * + * Prepare for ->rename() where the source and/or target directories may be + * encrypted. 
A new link can only be added to an encrypted directory if the + * directory's encryption key is available --- since otherwise we'd have no way + * to encrypt the filename. A rename to an existing name, on the other hand, + * *is* cryptographically possible without the key. However, we take the more + * conservative approach and just forbid all no-key renames. + * + * We also verify that the rename will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. + * + * Return: 0 on success, -ENOKEY if an encryption key is missing, -EPERM if the + * rename would cause inconsistent encryption policies, or another -errno code. + */ +static inline int fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) + return __fscrypt_prepare_rename(old_dir, old_dentry, + new_dir, new_dentry, flags); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index d7d1039eb6b5..6af378d8126e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -192,4 +192,13 @@ static inline int __fscrypt_prepare_link(struct inode *inode, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 80706283da75..40f35073145f 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -146,5 +146,10 @@ extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); extern int __fscrypt_prepare_link(struct inode 
*inode, struct inode *dir); +extern int __fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags); #endif /* _LINUX_FSCRYPT_SUPP_H */ From 5cbdd42ad248df655c3a07c0be92078e6e4ebe84 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:43 -0700 Subject: [PATCH 0518/1212] fscrypt: new helper function - fscrypt_prepare_lookup() Introduce a helper function which prepares to look up the given dentry in the given directory. If the directory is encrypted, it handles loading the directory's encryption key, setting the dentry's ->d_op to fscrypt_d_ops, and setting DCACHE_ENCRYPTED_WITH_KEY if the directory's encryption key is available. Note: once all filesystems switch over to this, we'll be able to move fscrypt_d_ops and fscrypt_set_encrypted_dentry() to fscrypt_private.h. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 18 ++++++++++++++++++ include/linux/fscrypt.h | 28 ++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 6 ++++++ include/linux/fscrypt_supp.h | 1 + 4 files changed, 53 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 822cb78f9b45..9f5fb2eb9cf7 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -92,3 +92,21 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); + +int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) +{ + int err = fscrypt_get_encryption_info(dir); + + if (err) + return err; + + if (fscrypt_has_encryption_key(dir)) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + } + + d_set_d_op(dentry, &fscrypt_d_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index d331050e93f4..9f1050721ab1 100644 --- 
a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -234,4 +234,32 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, return 0; } +/** + * fscrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory + * @dir: directory being searched + * @dentry: filename being looked up + * @flags: lookup flags + * + * Prepare for ->lookup() in a directory which may be encrypted. Lookups can be + * done with or without the directory's encryption key; without the key, + * filenames are presented in encrypted form. Therefore, we'll try to set up + * the directory's encryption key, but even without it the lookup can continue. + * + * To allow invalidating stale dentries if the directory's encryption key is + * added later, we also install a custom ->d_revalidate() method and use the + * DCACHE_ENCRYPTED_WITH_KEY flag to indicate whether a given dentry is a + * plaintext name (flag set) or a ciphertext name (flag cleared). + * + * Return: 0 on success, -errno if a problem occurred while setting up the + * encryption key + */ +static inline int fscrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(dir)) + return __fscrypt_prepare_lookup(dir, dentry); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 6af378d8126e..c4c6bf2c390e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -201,4 +201,10 @@ static inline int __fscrypt_prepare_rename(struct inode *old_dir, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 40f35073145f..2db5e9706f60 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -151,5 +151,6 @@ extern int 
__fscrypt_prepare_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); +extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry); #endif /* _LINUX_FSCRYPT_SUPP_H */ From 2286508d17c258719b7f1e37f8000cb4faebf51b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:44 -0700 Subject: [PATCH 0519/1212] fscrypt: new helper function - fscrypt_prepare_setattr() Introduce a helper function for filesystems to call when processing ->setattr() on a possibly-encrypted inode. It handles enforcing that an encrypted file can only be truncated if its encryption key is available. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 9f1050721ab1..8641e56b8f8a 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -262,4 +262,29 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, return 0; } +/** + * fscrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, + * most attribute changes are allowed even without the encryption key. However, + * without the encryption key we do have to forbid truncates. This is needed + * because the size being truncated to may not be a multiple of the filesystem + * block size, and in that case we'd have to decrypt the final block, zero the + * portion past i_size, and re-encrypt it. (We *could* allow truncating to a + * filesystem block boundary, but it's simpler to just forbid all truncates --- + * and we already forbid all other contents modifications without the key.) 
+ * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int fscrypt_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + if (attr->ia_valid & ATTR_SIZE) + return fscrypt_require_key(d_inode(dentry)); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ From 42d89da82b25f2d2e6bc062c1181d1fdd3926446 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 29 Oct 2017 06:30:19 -0400 Subject: [PATCH 0520/1212] fscrypt: lock mutex before checking for bounce page pool fscrypt_initialize(), which allocates the global bounce page pool when an encrypted file is first accessed, uses "double-checked locking" to try to avoid locking fscrypt_init_mutex. However, it doesn't use any memory barriers, so it's theoretically possible for a thread to observe a bounce page pool which has not been fully initialized. This is a classic bug with "double-checked locking". While "only a theoretical issue" in the latest kernel, in pre-4.8 kernels the pointer that was checked was not even the last to be initialized, so it was easily possible for a crash (NULL pointer dereference) to happen. This was changed only incidentally by the large refactor to use fs/crypto/. Solve both problems in a trivial way that can easily be backported: just always take the mutex. It's theoretically less efficient, but it shouldn't be noticeable in practice as the mutex is only acquired very briefly once per encrypted file. Later I'd like to make this use a helper macro like DO_ONCE(). However, DO_ONCE() runs in atomic context, so we'd need to add a new macro that allows blocking. 
Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 608f6bbe0f31..472326737717 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -410,11 +410,8 @@ int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - /* - * No need to allocate a bounce page pool if there already is one or - * this FS won't use it. - */ - if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) + /* No need to allocate a bounce page pool if this FS won't use it. */ + if (cop_flags & FS_CFLG_OWN_PAGES) return 0; mutex_lock(&fscrypt_init_mutex); From 4ecacbed6e1ca727c1df55b231b3a7247ac38c4f Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 18 Oct 2017 08:00:38 +0100 Subject: [PATCH 0521/1212] crypto: introduce crypto wait for async op Invoking a possibly async. crypto op and waiting for completion while correctly handling backlog processing is a common task in the crypto API implementation and outside users of it. This patch adds a generic implementation for doing so in preparation for using it across the board instead of hand rolled versions. 
Signed-off-by: Gilad Ben-Yossef CC: Eric Biggers CC: Jonathan Cameron Signed-off-by: Herbert Xu --- crypto/api.c | 13 +++++++++++++ include/linux/crypto.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/crypto/api.c b/crypto/api.c index bbc147cb5dec..e5c1abfd451f 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" LIST_HEAD(crypto_alg_list); @@ -611,5 +612,17 @@ int crypto_has_alg(const char *name, u32 type, u32 mask) } EXPORT_SYMBOL_GPL(crypto_has_alg); +void crypto_req_done(struct crypto_async_request *req, int err) +{ + struct crypto_wait *wait = req->data; + + if (err == -EINPROGRESS) + return; + + wait->err = err; + complete(&wait->completion); +} +EXPORT_SYMBOL_GPL(crypto_req_done); + MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL"); diff --git a/include/linux/crypto.h b/include/linux/crypto.h index e71cb70a1ac2..b7c1e1a7ebac 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -24,6 +24,7 @@ #include #include #include +#include /* * Autoloaded crypto modules should only use a prefixed name to avoid allowing @@ -469,6 +470,45 @@ struct crypto_alg { struct module *cra_module; } CRYPTO_MINALIGN_ATTR; +/* + * A helper struct for waiting for completion of async crypto ops + */ +struct crypto_wait { + struct completion completion; + int err; +}; + +/* + * Macro for declaring a crypto op async wait object on stack + */ +#define DECLARE_CRYPTO_WAIT(_wait) \ + struct crypto_wait _wait = { \ + COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 } + +/* + * Async ops completion helper functioons + */ +void crypto_req_done(struct crypto_async_request *req, int err); + +static inline int crypto_wait_req(int err, struct crypto_wait *wait) +{ + switch (err) { + case -EINPROGRESS: + case -EBUSY: + wait_for_completion(&wait->completion); + reinit_completion(&wait->completion); + err = wait->err; + break; + }; + + return err; 
+} + +static inline void crypto_init_wait(struct crypto_wait *wait) +{ + init_completion(&wait->completion); +} + /* * Algorithm registration interface. */ From 9e32f17d241bf2aceef65a33c133d09890fa20d4 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 18 Oct 2017 08:00:44 +0100 Subject: [PATCH 0522/1212] fscrypt: move to generic async completion fscrypt starts several async. crypto ops and waiting for them to complete. Move it over to generic code doing the same. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- fs/crypto/crypto.c | 28 ++++------------------------ fs/crypto/fname.c | 36 ++++++------------------------------ fs/crypto/fscrypt_private.h | 9 --------- fs/crypto/keyinfo.c | 21 +++------------------ 4 files changed, 13 insertions(+), 81 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 472326737717..732a786cce9d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -126,21 +126,6 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) } EXPORT_SYMBOL(fscrypt_get_ctx); -/** - * page_crypt_complete() - completion callback for page crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void page_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, unsigned int len, @@ -151,7 +136,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u8 padding[FS_IV_SIZE - sizeof(__le64)]; } iv; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -179,7 
+164,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - page_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); @@ -187,14 +172,9 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, sg_set_page(&src, src_page, len, offs); skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) - res = crypto_skcipher_decrypt(req); + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); else - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { printk_ratelimited(KERN_ERR diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 2878289b3ed2..6eb434363ff2 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,21 +14,6 @@ #include #include "fscrypt_private.h" -/** - * fname_crypt_complete() - completion callback for filename crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void fname_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - /** * fname_encrypt() - encrypt a filename * @@ -40,7 +25,7 @@ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; @@ -76,17 +61,12 @@ static int fname_encrypt(struct inode *inode, } skcipher_request_set_callback(req, 
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_one(&sg, oname->name, cryptlen); skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); /* Do the encryption */ - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - /* Request is being completed asynchronously; wait for it */ - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR @@ -110,7 +90,7 @@ static int fname_decrypt(struct inode *inode, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -131,7 +111,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -140,11 +120,7 @@ static int fname_decrypt(struct inode *inode, sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_skcipher_decrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ff97988fe6e9..c3ad415cd14f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -70,15 +70,6 @@ typedef enum { #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 -struct 
fscrypt_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_FS_COMPLETION_RESULT(ecr) \ - struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } - /* bio stuffs */ #define REQ_OP_READ READ #define REQ_OP_WRITE WRITE diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 3ce6ca91ce23..444c65ed6db8 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -17,17 +17,6 @@ static struct crypto_shash *essiv_hash_tfm; -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - /** * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. @@ -42,7 +31,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], { int res = 0; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); @@ -59,7 +48,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); + crypto_req_done, &wait); res = crypto_skcipher_setkey(tfm, deriving_key, FS_AES_128_ECB_KEY_SIZE); if (res < 0) @@ -69,11 +58,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], sg_init_one(&dst_sg, derived_raw_key, source_key->size); skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, NULL); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: skcipher_request_free(req); crypto_free_skcipher(tfm); From 
ba1ade71012d50c8c9bedfc6ed6c009a7f4de59e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 9 Jan 2018 16:52:25 -0800 Subject: [PATCH 0523/1212] fscrypt: resolve some cherry-pick bugs - remove wrong linux/fscrypt.h declared in ext4 - remove obsolete function Fixes: 734f0d241d2b ("fscrypt: clean up include file mess") Signed-off-by: Jaegeuk Kim --- fs/ext4/ext4.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bea7d7febdab..b7e921d207fb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -37,9 +37,6 @@ #include #endif -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) -#include - /* * The fourth extended filesystem constants/structures */ From 4dd2d0733809951ac9ac4acdeefce30519019261 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Nov 2017 16:59:14 +0800 Subject: [PATCH 0524/1212] f2fs: reserve nid resource for quota sysfile During mkfs, quota sysfiles have already occupied nid resource, it needs to adjust remaining available nid count in kernel side. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 9 +-------- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/node.c | 2 +- fs/f2fs/super.c | 10 +++++++++- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ecada8425268..4d929627e210 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -49,14 +49,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; - - si->nquota_files = 0; - if (f2fs_sb_has_quota_ino(sbi->sb)) { - for (i = 0; i < MAXQUOTAS; i++) { - if (f2fs_qf_ino(sbi->sb, i)) - si->nquota_files++; - } - } + si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc4a95e848af..8c03659b13cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1165,6 +1165,8 @@ struct f2fs_sb_info { block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + unsigned int nquota_files; /* # of quota sysfile */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 964c99655942..dca69888d6d3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2683,7 +2683,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - - F2FS_RESERVED_NODE_NUM; + sbi->nquota_files - F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 187cead7bd37..037d22233886 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1009,7 +1009,8 @@ static 
int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + avail_node_count = sbi->total_node_count - sbi->nquota_files - + F2FS_RESERVED_NODE_NUM; if (avail_node_count > user_block_count) { buf->f_files = user_block_count; @@ -2470,6 +2471,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) else sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; + + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + sbi->nquota_files++; + } + } #endif sb->s_op = &f2fs_sops; From 2d69561135f2cd0e044657f52a031cea2d0d2652 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 17 Nov 2017 16:13:38 +0800 Subject: [PATCH 0525/1212] f2fs: no need to read nat block if nat_block_bitmap is set No need to read nat block if nat_block_bitmap is set. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dca69888d6d3..81972b156ebe 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1952,9 +1952,6 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; - if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) - return; - __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; @@ -2059,10 +2056,13 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) down_read(&nm_i->nat_tree_lock); while (1) { - struct page *page = get_current_nat_page(sbi, nid); + if (!test_bit_le(NAT_BLOCK_OFFSET(nid), + nm_i->nat_block_bitmap)) { + struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); + scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + } nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); if (unlikely(nid >= nm_i->max_nid)) From e1398f6554b462729062d986b100e022b290e7a4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 21 Nov 2017 17:49:54 +0800 Subject: [PATCH 0526/1212] f2fs: remove unneeded memory footprint accounting We forgot to remove memory footprint accounting of per-cpu type variables, fix it.
Fixes: 35782b233f37 ("f2fs: remove percpu_count due to performance regression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 4d929627e210..674f9bbe98d9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -179,7 +179,6 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); - si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); From 55e2f89181ceff5dcebd87c0138d64dba8c3644d Mon Sep 17 00:00:00 2001 From: LiFan Date: Wed, 22 Nov 2017 16:07:23 +0800 Subject: [PATCH 0527/1212] f2fs: fix concurrent problem for updating free bitmap alloc_nid_failed and scan_nat_page can be called at the same time, and we haven't protected add_free_nid and update_free_nid_bitmap with the same nid_list_lock. That could lead to Thread A Thread B - __build_free_nids - scan_nat_page - add_free_nid - alloc_nid_failed - update_free_nid_bitmap - update_free_nid_bitmap scan_nat_page will clear the free bitmap since the nid is PREALLOC_NID, but alloc_nid_failed needs to set the free bitmap. This results in a free nid with its free bitmap bit cleared. This patch updates the bitmap under the same nid_list_lock in add_free_nid, and uses __GFP_NOFAIL to make sure the status of the free nid is updated correctly.
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 85 +++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 81972b156ebe..563c08c4aa7a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1834,8 +1834,33 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + nm_i->free_nid_count[nat_ofs]++; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } +} + /* return if the nid is recognized as free */ -static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +static bool add_free_nid(struct f2fs_sb_info *sbi, + nid_t nid, bool build, bool update) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *e; @@ -1851,8 +1876,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i->nid = nid; i->state = FREE_NID; - if (radix_tree_preload(GFP_NOFS)) - goto err; + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&nm_i->nid_list_lock); @@ -1893,9 +1917,14 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) ret = true; err = __insert_free_nid(sbi, i, FREE_NID); err_out: + if (update) { + update_free_nid_bitmap(sbi, nid, ret, build); + if (!build) + nm_i->available_nids++; + } spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); -err: + if (err) 
kmem_cache_free(free_nid_slab, i); return ret; @@ -1919,30 +1948,6 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); - unsigned int nid_ofs = nid - START_NID(nid); - - if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) - return; - - if (set) { - if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) - return; - __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - nm_i->free_nid_count[nat_ofs]++; - } else { - if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) - return; - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!build) - nm_i->free_nid_count[nat_ofs]--; - } -} - static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { @@ -1957,18 +1962,18 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - bool freed = false; - if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) - freed = add_free_nid(sbi, start_nid, true); - spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, start_nid, freed, true); - spin_unlock(&NM_I(sbi)->nid_list_lock); + if (blk_addr == NULL_ADDR) { + add_free_nid(sbi, start_nid, true, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, false, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } } @@ -1986,7 +1991,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi) addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); + add_free_nid(sbi, nid, true, false); 
else remove_free_nid(sbi, nid); } @@ -2013,7 +2018,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) break; nid = i * NAT_ENTRY_PER_BLOCK + idx; - add_free_nid(sbi, nid, true); + add_free_nid(sbi, nid, true, false); if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; @@ -2519,11 +2524,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { - add_free_nid(sbi, nid, false); - spin_lock(&NM_I(sbi)->nid_list_lock); - NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); - spin_unlock(&NM_I(sbi)->nid_list_lock); + add_free_nid(sbi, nid, false, true); } else { spin_lock(&NM_I(sbi)->nid_list_lock); update_free_nid_bitmap(sbi, nid, false, false); From 47ee9b259811529a4653910f39648dbecd6bf89c Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 22 Nov 2017 18:23:38 +0800 Subject: [PATCH 0528/1212] f2fs: introduce sysfs readdir_ra to readahead inode block in readdir This patch introduces a sysfs interface readdir_ra to enable/disable readaheading inode block in f2fs_readdir. When readdir_ra is enabled, it improves the performance of "readdir + stat". 
For 300,000 files: time find /data/test > /dev/null disable readdir_ra: 1m25.69s real 0m01.94s user 0m50.80s system enable readdir_ra: 0m18.55s real 0m00.44s user 0m15.39s system Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/dir.c | 4 ++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 2 ++ 4 files changed, 13 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2baed1151eac..db7aab1516de 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -186,3 +186,9 @@ Date: August 2017 Contact: "Jaegeuk Kim" Description: Controls sleep time of GC urgent mode + +What: /sys/fs/f2fs//readdir_ra +Date: November 2017 +Contact: "Sheng Yong" +Description: + Controls readahead inode block in readdir. diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1955707b138b..55fb45b66ed2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -798,6 +798,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -836,6 +837,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, le32_to_cpu(de->ino), d_type)) return 1; + if (sbi->readdir_ra == 1) + ra_node_page(sbi, le32_to_cpu(de->ino)); + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c03659b13cd..e146ad84e09a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1157,6 +1157,7 @@ struct f2fs_sb_info { int dir_level; /* directory level */ int inline_xattr_size; /* inline xattr size */ unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ + int readdir_ra; /* readahead inode in readdir */ block_t 
user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9835348b6e5d..93c3364250dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -299,6 +299,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -346,6 +347,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(iostat_enable), + ATTR_LIST(readdir_ra), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), From f130dbb98a68aeab036a82f25589d59fb9a4721d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 22 Nov 2017 18:23:39 +0800 Subject: [PATCH 0529/1212] f2fs: still write data if preallocate only partial blocks If there is not enough space left, f2fs_preallocate_blocks may only preallocate partial blocks. As a result, the write operation fails but i_blocks is not 0. To avoid this, f2fs should write data in non-preallocation way and write as much data as the size of i_blocks.
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 043394aa6c62..58fec1c9d460 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -860,8 +860,14 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (err) return err; } - if (!f2fs_has_inline_data(inode)) - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (!f2fs_has_inline_data(inode)) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (map.m_len > 0 && err == -ENOSPC) { + set_inode_flag(inode, FI_NO_PREALLOC); + err = 0; + } + return err; + } return err; } From e5c7c86010305630cfe34130d113a770d169ab4f Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 22 Nov 2017 18:23:40 +0800 Subject: [PATCH 0530/1212] f2fs: remove unused parameter Commit d260081ccf37 ("f2fs: change recovery policy of xattr node block") removes the use of blkaddr, which is no longer used. So remove the parameter. 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/node.c | 2 +- fs/f2fs/recovery.c | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e146ad84e09a..a785fd3453bb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2689,8 +2689,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void recover_inline_xattr(struct inode *inode, struct page *page); -int recover_xattr_data(struct inode *inode, struct page *page, - block_t blkaddr); +int recover_xattr_data(struct inode *inode, struct page *page); int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 563c08c4aa7a..ef7330e939f3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2246,7 +2246,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_put_page(ipage, 1); } -int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 92c57ace1939..7d63faf51e52 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -404,7 +404,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, block_t blkaddr) + struct page *page) { struct dnode_of_data dn; struct node_info ni; @@ -415,7 +415,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if 
(f2fs_has_xattr_block(ofs_of_node(page))) { - err = recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page); if (!err) recovered++; goto out; @@ -568,7 +568,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, break; } } - err = do_recover_data(sbi, entry->inode, page, blkaddr); + err = do_recover_data(sbi, entry->inode, page); if (err) { f2fs_put_page(page, 1); break; From e1f9be2f7c82b8b0ac1340fe9ddc3d25fea24b71 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Nov 2017 23:26:52 +0800 Subject: [PATCH 0531/1212] f2fs: fix lock dependency in between dio_rwsem & i_mmap_sem test/generic/208 reports a potential deadlock as below: Chain exists of: &mm->mmap_sem --> &fi->i_mmap_sem --> &fi->dio_rwsem[WRITE] Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fi->dio_rwsem[WRITE]); lock(&fi->i_mmap_sem); lock(&fi->dio_rwsem[WRITE]); lock(&mm->mmap_sem); This patch changes the lock dependency as below in fallocate() to fix this issue: - dio_rwsem - i_mmap_sem Fixes: bb06664a534b ("f2fs: avoid race in between GC and block exchange") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bfff53f658e1..e2990f67a4ee 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1170,14 +1170,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + goto out_unlock; truncate_pagecache(inode, offset); @@ -1196,9 
+1196,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); out_unlock: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); -out: up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); return ret; } @@ -1369,6 +1368,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1379,9 +1381,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); - truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1409,10 +1408,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); - - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); return ret; } From b01e03d724dec89a9366526e7e023212ebf9ccec Mon Sep 17 00:00:00 2001 From: LiFan Date: Sat, 25 Nov 2017 11:46:18 +0800 Subject: [PATCH 0532/1212] f2fs: remove an excess variable Remove the variable page_idx which no one would miss. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 58fec1c9d460..d2558828915d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1202,7 +1202,6 @@ static int f2fs_mpage_readpages(struct address_space *mapping, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; @@ -1219,8 +1218,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_flags = 0; map.m_next_pgofs = NULL; - for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - + for (; nr_pages; nr_pages--) { if (pages) { page = list_last_entry(pages, struct page, lru); From 6d025237a1f8f205c08efdb80ba991bf41df98b1 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Sun, 26 Nov 2017 02:34:28 +0800 Subject: [PATCH 0533/1212] f2fs: remove repeated f2fs_bug_on f2fs: remove repeated f2fs_bug_on which has already existed in function invalidate_blocks. 
Signed-off-by: Zhikang Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ef7330e939f3..dda40f5c4c9b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -702,7 +702,6 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); @@ -2260,7 +2259,6 @@ int recover_xattr_data(struct inode *inode, struct page *page) /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); - f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); From 101c6a96ad1c69011659bb1ddc9bad2b534aac17 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:28 -0800 Subject: [PATCH 0534/1212] f2fs: switch to fscrypt_file_open() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e2990f67a4ee..81f298ac4cb2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -474,22 +474,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - struct dentry *dir; + int err = fscrypt_file_open(inode, filp); - if (f2fs_encrypted_inode(inode)) { - int ret = fscrypt_get_encryption_info(inode); - if (ret) - return -EACCES; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - dir = dget_parent(file_dentry(filp)); - if (f2fs_encrypted_inode(d_inode(dir)) && - !fscrypt_has_permitted_context(d_inode(dir), inode)) { - dput(dir); - return -EPERM; - } - dput(dir); + if (err) + return err; return dquot_file_open(inode, filp); } From 
aeaac517a12d29fdb94edec69d1e6140f775e483 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:29 -0800 Subject: [PATCH 0535/1212] f2fs: switch to fscrypt_prepare_link() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index cf8f4370d256..c93df5a1b305 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -240,9 +240,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir) && - !fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(dir)->i_projid, From 9ab470eaf8a8fb0bee0eb781f151ddbe677385b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:30 -0800 Subject: [PATCH 0536/1212] f2fs: switch to fscrypt_prepare_rename() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c93df5a1b305..e7fd30e45f47 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -797,18 +797,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if ((f2fs_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (f2fs_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !fscrypt_has_permitted_context(new_dir, old_inode)) { - err = -EPERM; - goto out; - } - if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, F2FS_I(old_dentry->d_inode)->i_projid))) @@ -999,18 
+987,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if ((f2fs_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (f2fs_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!fscrypt_has_permitted_context(new_dir, old_inode) || - !fscrypt_has_permitted_context(old_dir, new_inode))) - return -EPERM; - if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, F2FS_I(old_dentry->d_inode)->i_projid)) || @@ -1150,9 +1126,16 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int err; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + if (flags & RENAME_EXCHANGE) { return f2fs_cross_rename(old_dir, old_dentry, new_dir, new_dentry); From bb8b850365ffd071b14def46b80eaa07bded0e13 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:31 -0800 Subject: [PATCH 0537/1212] f2fs: switch to fscrypt_prepare_lookup() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e7fd30e45f47..6e2c78c06f79 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -357,20 +357,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, trace_f2fs_lookup_start(dir, dentry, flags); - if (f2fs_encrypted_inode(dir)) { - err = fscrypt_get_encryption_info(dir); - - /* - * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is - * created while the directory was encrypted and we - * don't have 
access to the key. - */ - if (fscrypt_has_encryption_key(dir)) - fscrypt_set_encrypted_dentry(dentry); - fscrypt_set_d_op(dentry); - if (err && err != -ENOKEY) - goto out; - } + err = fscrypt_prepare_lookup(dir, dentry, flags); + if (err) + goto out; if (dentry->d_name.len > F2FS_NAME_LEN) { err = -ENAMETOOLONG; From c80f01959114c0f49bac2a006a419b8d59104353 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:32 -0800 Subject: [PATCH 0538/1212] f2fs: switch to fscrypt_prepare_setattr() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 81f298ac4cb2..69bb0bb44826 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -724,6 +724,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + if (is_quota_modification(inode, attr)) { err = dquot_initialize(inode); if (err) @@ -739,14 +743,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return err; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - if (attr->ia_size <= i_size_read(inode)) { down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); From 3bc01114a338a9ac336b3e139948e69ef0488a43 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Tue, 28 Nov 2017 09:23:00 +0900 Subject: [PATCH 0539/1212] f2fs: apply write hints to select the type of segment for direct write When blocks are allocated for direct write, select the type of segment using the kiocb hint. But if an inode has FI_NO_PREALLOC, use the inode hint. 
Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 28 ++++++++++++++++++++-------- fs/f2fs/f2fs.h | 11 +++++++++++ fs/f2fs/file.c | 6 ++++-- fs/f2fs/segment.c | 2 -- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d2558828915d..3b0cf32c1d66 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -781,7 +781,7 @@ struct page *get_new_data_page(struct inode *inode, return page; } -static int __allocate_data_block(struct dnode_of_data *dn) +static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; @@ -806,7 +806,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA, NULL, false); + &sum, seg_type, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -849,12 +849,16 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_len = 0; map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; - if (iocb->ki_flags & IOCB_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) { + /* map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); */ + map.m_seg_type = rw_hint_to_seg_type(WRITE_LIFE_NOT_SET); return f2fs_map_blocks(inode, &map, 1, __force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); + } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) @@ -964,7 +968,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, last_ofs_in_node = dn.ofs_in_node; } } else { - err = __allocate_data_block(&dn); + err = __allocate_data_block(&dn, + map->m_seg_type); if (!err) set_inode_flag(inode, FI_APPEND_WRITE); } @@ -1057,7 +1062,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs) + pgoff_t *next_pgofs, int seg_type) { struct f2fs_map_blocks map; int err; @@ -1065,6 +1070,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; + map.m_seg_type = seg_type; err = f2fs_map_blocks(inode, &map, create, flag); if (!err) { @@ -1080,14 +1086,18 @@ static int get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { return __get_data_block(inode, iblock, bh_result, create, - flag, next_pgofs); + flag, next_pgofs, + NO_CHECK_TYPE); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DEFAULT, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL, + rw_hint_to_seg_type( + WRITE_LIFE_NOT_SET)); + /* inode->i_write_hint)); */ } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1098,7 +1108,8 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock, return -EFBIG; return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP, NULL); + F2FS_GET_BLOCK_BMAP, NULL, + NO_CHECK_TYPE); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -1217,6 +1228,7 @@ static int 
f2fs_mpage_readpages(struct address_space *mapping, map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; for (; nr_pages; nr_pages--) { if (pages) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a785fd3453bb..721ea01f28bb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -606,6 +606,7 @@ struct f2fs_map_blocks { unsigned int m_len; unsigned int m_flags; pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + int m_seg_type; }; /* for flag in get_data_block */ @@ -2503,6 +2504,15 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = 1, /* RWH_WRITE_LIFE_NONE */ + WRITE_LIFE_SHORT = 2, /* RWH_WRITE_LIFE_SHORT */ + WRITE_LIFE_MEDIUM = 3, /* RWH_WRITE_LIFE_MEDIUM */ + WRITE_LIFE_LONG = 4, /* RWH_WRITE_LIFE_LONG */ + WRITE_LIFE_EXTREME = 5, /* RWH_WRITE_LIFE_EXTREME */ +}; + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); @@ -2756,6 +2766,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi); void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); +int rw_hint_to_seg_type(enum rw_hint hint); /* * checkpoint.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 69bb0bb44826..a5e96f8fc42c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1402,7 +1402,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_seg_type = NO_CHECK_TYPE }; pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -2042,7 +2043,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_defragment *range) { struct inode *inode = file_inode(filp); - struct f2fs_map_blocks map = { 
.m_next_pgofs = NULL }; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_seg_type = NO_CHECK_TYPE }; struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 94939a5a96c8..82fb22b5e4ad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2514,7 +2514,6 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -#if 0 int rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { @@ -2526,7 +2525,6 @@ int rw_hint_to_seg_type(enum rw_hint hint) return CURSEG_WARM_DATA; } } -#endif static int __get_segment_type_2(struct f2fs_io_info *fio) { From d94680798786ffd3e8c87c6d2727a75c6616dc5b Mon Sep 17 00:00:00 2001 From: LiFan Date: Tue, 28 Nov 2017 20:17:41 +0800 Subject: [PATCH 0540/1212] f2fs: remove a redundant conditional expression Avoid checking is_inode repeatedly, and make the logic a little bit clearer. Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 721ea01f28bb..62e6f630381c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2057,11 +2057,11 @@ static inline block_t datablock_addr(struct inode *inode, raw_node = F2FS_NODE(node_page); /* from GC path only */ - if (!inode) { - if (is_inode) + if (is_inode) { + if (!inode) base = offset_in_addr(&raw_node->i); - } else if (f2fs_has_extra_attr(inode) && is_inode) { - base = get_extra_isize(inode); + else if (f2fs_has_extra_attr(inode)) + base = get_extra_isize(inode); } addr_array = blkaddr_in_node(raw_node); From 8b33886c37cdff86070ca0fec4bdf7f644dea219 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:17 +0800 Subject: [PATCH 0541/1212] f2fs: inject fault to kzalloc This patch introduces f2fs_kzalloc based on f2fs_kmalloc in order to support error injection for kzalloc(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/namei.c | 2 +- fs/f2fs/node.c | 7 ++++--- fs/f2fs/segment.c | 30 ++++++++++++++++-------------- fs/f2fs/xattr.c | 8 ++++---- 7 files changed, 33 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2eb778174a9b..8e629434cd05 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -797,7 +797,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) block_t cp_blk_no; int i; - sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); + sbi->ckpt = f2fs_kzalloc(sbi, cp_blks * blk_size, GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 674f9bbe98d9..a66107b5cfff 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -439,7 +439,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; - si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); if (!si) return -ENOMEM; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 62e6f630381c..569d9fb0bada 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2513,6 +2513,12 @@ enum rw_hint { WRITE_LIFE_EXTREME = 5, /* RWH_WRITE_LIFE_EXTREME */ }; +static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); +} + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6e2c78c06f79..f44ce8c34966 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -530,7 +530,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct qstr istr = QSTR_INIT(symname, len); struct fscrypt_str ostr; - sd = kzalloc(disk_link.len, GFP_NOFS); + sd = f2fs_kzalloc(sbi, 
disk_link.len, GFP_NOFS); if (!sd) { err = -ENOMEM; goto err_out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dda40f5c4c9b..f10f685a2601 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2603,8 +2603,8 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + F2FS_BLKSIZE - 1); - nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, - GFP_KERNEL); + nm_i->nat_bits = f2fs_kzalloc(sbi, + nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; @@ -2750,7 +2750,8 @@ int build_node_manager(struct f2fs_sb_info *sbi) { int err; - sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL); + sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info), + GFP_KERNEL); if (!sbi->nm_info) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 82fb22b5e4ad..19403f8e2161 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -657,7 +657,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) goto init_thread; } - fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); + fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); @@ -1817,7 +1817,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) goto init_thread; } - dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); if (!dcc) return -ENOMEM; @@ -3419,7 +3419,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) unsigned int bitmap_size; /* allocate memory for SIT information */ - sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL); + sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; @@ -3437,29 +3437,30 @@ static int build_sit_info(struct f2fs_sb_info *sbi) for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map 
- = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sentries[start].cur_valid_map_mir - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map_mir) return -ENOMEM; #endif if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); if (!sit_i->sentries[start].discard_map) return -ENOMEM; } } - sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->tmp_map) return -ENOMEM; @@ -3508,7 +3509,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ - free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL); + free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; @@ -3549,12 +3550,12 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); + array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); - array[i].journal = kzalloc(sizeof(struct f2fs_journal), - GFP_KERNEL); + array[i].journal = f2fs_kzalloc(sbi, + sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].segno = NULL_SEGNO; @@ -3712,7 +3713,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) unsigned int bitmap_size, i; /* 
allocate memory for dirty segments list information */ - dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL); + dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), + GFP_KERNEL); if (!dirty_i) return -ENOMEM; @@ -3766,7 +3768,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) struct f2fs_sm_info *sm_info; int err; - sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); + sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); if (!sm_info) return -ENOMEM; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7acf56ebda65..47ac858787ea 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -345,8 +345,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, - GFP_F2FS_ZERO); + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -398,8 +398,8 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, void *txattr_addr; int err; - txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, - GFP_F2FS_ZERO); + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); if (!txattr_addr) return -ENOMEM; From 5d4e487b9929cced66ccdeb29e0ef429fed2f504 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:18 +0800 Subject: [PATCH 0542/1212] f2fs: inject fault to kvmalloc This patch adds support for injecting faults into kvmalloc/kvzalloc. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 19 +++++++++++++++++++ fs/f2fs/file.c | 6 ++++-- fs/f2fs/node.c | 6 +++--- fs/f2fs/segment.c | 16 +++++++++------- fs/f2fs/super.c | 1 + 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 569d9fb0bada..1320f7255fb1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -44,6 +44,7 @@ #ifdef CONFIG_F2FS_FAULT_INJECTION enum { FAULT_KMALLOC, + FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, FAULT_ALLOC_BIO, @@ -2519,6 +2520,24 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); } +static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KVMALLOC)) { + f2fs_show_injection_info(FAULT_KVMALLOC); + return NULL; + } +#endif + return kvmalloc(size, flags); +} + +static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); +} + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a5e96f8fc42c..260aeb0d8bc2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1082,11 +1082,13 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), + sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), + sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f10f685a2601..d833efceae82 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ 
-2729,17 +2729,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = f2fs_kvzalloc(sbi, nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 19403f8e2161..ac12f3deac75 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3425,13 +3425,14 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = f2fs_kvzalloc(sbi, MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -3465,7 +3466,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = f2fs_kvzalloc(sbi, MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -3516,12 +3517,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return 
-ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -3701,7 +3702,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -3724,7 +3725,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 037d22233886..17ac1c9b2f85 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -43,6 +43,7 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", [FAULT_ALLOC_BIO] = "alloc bio", From e7db649b5fb191a56fb83ed47c3bbe08f4b7c955 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:19 +0800 Subject: [PATCH 0543/1212] f2fs: spread f2fs_k{m,z}alloc Use f2fs_k{m,z}alloc as much as possible to increase fault injection points. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ac12f3deac75..fac18cc58c44 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3543,7 +3543,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); + array = f2fs_kzalloc(sbi, sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); if (!array) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 17ac1c9b2f85..ce3b4d88de6e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2154,14 +2154,15 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; - FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + FDEV(devi).blkz_type = f2fs_kmalloc(sbi, FDEV(devi).nr_blkz, + GFP_KERNEL); if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 - zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), - GFP_KERNEL); + zones = f2fs_kzalloc(sbi, sizeof(struct blk_zone) * + F2FS_REPORT_NR_ZONES, GFP_KERNEL); if (!zones) return -ENOMEM; @@ -2305,8 +2306,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) * Initialize multiple devices information, or single * zoned block device information. */ - sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), - GFP_KERNEL); + sbi->devs = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_info) * + max_devices, GFP_KERNEL); if (!sbi->devs) return -ENOMEM; @@ -2512,8 +2513,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) int n = (i == META) ? 
1: NR_TEMP_TYPE; int j; - sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), - GFP_KERNEL); + sbi->write_io[i] = f2fs_kmalloc(sbi, + n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; goto free_options; From 1f994d47080c0bac79eb20e90649664799cfce28 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:20 +0800 Subject: [PATCH 0544/1212] f2fs: fix error handling in fill_super In fill_super, if we fail to call f2fs_build_stats(), it needs to detach from global f2fs shrink list, otherwise once system starts to shrink slab cache, we will encounter below panic: BUG: unable to handle kernel paging request at 00007d35 Oops: 0002 [#1] PREEMPT SMP EIP: __lock_acquire+0x70/0x12c0 Call Trace: lock_acquire+0xae/0x220 mutex_trylock+0xc5/0xf0 f2fs_shrink_count+0x32/0xb0 [f2fs] shrink_slab+0xf1/0x5b0 drop_slab_node+0x35/0x60 drop_slab+0xf/0x20 drop_caches_sysctl_handler+0x79/0xc0 proc_sys_call_handler+0xa4/0xc0 proc_sys_write+0x1f/0x30 __vfs_write+0x24/0x150 SyS_write+0x44/0x90 do_fast_syscall_32+0xa1/0x1ca entry_SYSENTER_32+0x4c/0x7b In addition, this patch relocates f2fs_join_shrinker in fill_super to avoid unneeded error handling of it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce3b4d88de6e..b6a96a8fb794 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2623,18 +2623,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_nm; } - f2fs_join_shrinker(sbi); - err = f2fs_build_stats(sbi); if (err) - goto free_nm; + goto free_node_inode; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); - goto free_node_inode; + goto free_stats; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); @@ -2730,6 +2728,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->valid_super_block ? 1 : 2, err); } + f2fs_join_shrinker(sbi); + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); @@ -2756,14 +2756,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_root_inode: dput(sb->s_root); sb->s_root = NULL; -free_node_inode: - truncate_inode_pages_final(NODE_MAPPING(sbi)); - mutex_lock(&sbi->umount_mutex); - release_ino_entry(sbi, true); - f2fs_leave_shrinker(sbi); - iput(sbi->node_inode); - mutex_unlock(&sbi->umount_mutex); +free_stats: f2fs_destroy_stats(sbi); +free_node_inode: + release_ino_entry(sbi, true); + truncate_inode_pages_final(NODE_MAPPING(sbi)); + iput(sbi->node_inode); free_nm: destroy_node_manager(sbi); free_sm: From e4f5e26cdadf858848e1157067fc853d3c41eb4e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:21 +0800 Subject: [PATCH 0545/1212] f2fs: clean up hash codes f2fs_chksum and f2fs_crc32 use the same 'crc32' crypto engine, also their implementation are almost the same, except with different shash 
description context. Introduce __f2fs_crc32 to wrap the common codes, and reuse it in f2fs_chksum and f2fs_crc32. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1320f7255fb1..751654b48a4d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1319,30 +1319,7 @@ static inline bool is_idle(struct f2fs_sb_info *sbi) /* * Inline functions */ -static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, - unsigned int length) -{ - SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); - u32 *ctx = (u32 *)shash_desc_ctx(shash); - int err; - - shash->tfm = sbi->s_chksum_driver; - shash->flags = 0; - *ctx = F2FS_SUPER_MAGIC; - - err = crypto_shash_update(shash, address, length); - BUG_ON(err); - - return *ctx; -} - -static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, - void *buf, size_t buf_size) -{ - return f2fs_crc32(sbi, buf, buf_size) == blk_crc; -} - -static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, +static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, const void *address, unsigned int length) { struct { @@ -1363,6 +1340,24 @@ static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, return *(u32 *)desc.ctx; } +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length); +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + return __f2fs_crc32(sbi, crc, address, length); +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); 
From f7986c416d1b4d50e9129c02d6e2d6849db3ea24 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:22 +0800 Subject: [PATCH 0546/1212] f2fs: clean up f2fs_map_blocks f2fs_map_blocks(): if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { ... } else { ... if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { ... } if (flag != F2FS_GET_BLOCK_FIEMAP || blkaddr != NEW_ADDR) goto sync_out; } It means we can break the loop in cases of: a) flag != F2FS_GET_BLOCK_FIEMAP or b) flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR Condition b) is the same as previous one, so merge operations of them for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3b0cf32c1d66..b7fd9f010b2b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -986,9 +986,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; + goto sync_out; } - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) + if (flag != F2FS_GET_BLOCK_FIEMAP) goto sync_out; } } From 925d0933d8f0f0ed01cca63c094ff6ae316d0787 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:23 +0800 Subject: [PATCH 0547/1212] f2fs: don't return value in truncate_data_blocks_range There is no caller cares about return value of truncate_data_blocks_range, remove it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 751654b48a4d..9a8a2624944b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2594,7 +2594,7 @@ int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int f2fs_setattr(struct dentry *dentry, struct iattr *attr); int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); -int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +void truncate_data_blocks_range(struct dnode_of_data *dn, int count); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 260aeb0d8bc2..b88efbfd22e7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -481,7 +481,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return dquot_file_open(inode, filp); } -int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +void truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; @@ -524,7 +524,6 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); - return nr_free; } void truncate_data_blocks(struct dnode_of_data *dn) From 04d44000d633c51c2732cfd4e3540ae250299646 Mon Sep 17 00:00:00 2001 From: LiFan Date: Tue, 5 Dec 2017 16:38:01 +0800 Subject: [PATCH 0548/1212] f2fs: use unlikely for release case Since the variable release is only nonzero when another unlikely case occurs, use unlikely() on it seems logical. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a8a2624944b..58e9c35bd55b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1658,7 +1658,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, } spin_unlock(&sbi->stat_lock); - if (release) + if (unlikely(release)) dquot_release_reservation_block(inode, release); f2fs_i_blocks_write(inode, *count, true, true); return 0; From e81cafbeba4bf252b24778a17aef3f623a0815e3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 6 Dec 2017 11:31:29 +0800 Subject: [PATCH 0549/1212] f2fs: no need return value in restore summary process No need return value in restore summary process Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 14 +++----------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 58e9c35bd55b..b59be85c5e24 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2721,7 +2721,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void recover_inline_xattr(struct inode *inode, struct page *page); int recover_xattr_data(struct inode *inode, struct page *page); int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -int restore_node_summary(struct f2fs_sb_info *sbi, +void restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int build_node_manager(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d833efceae82..9453975c9799 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2347,7 +2347,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) return 0; } -int restore_node_summary(struct f2fs_sb_info *sbi, +void 
restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2380,7 +2380,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, invalidate_mapping_pages(META_MAPPING(sbi), addr, addr + nrpages); } - return 0; } static void remove_nats_in_journal(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fac18cc58c44..2206c297ec16 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2904,7 +2904,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static int read_compacted_summaries(struct f2fs_sb_info *sbi) +static void read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; @@ -2961,7 +2961,6 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) } } f2fs_put_page(page, 1); - return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) @@ -3007,13 +3006,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - int err; - - err = restore_node_summary(sbi, segno, sum); - if (err) { - f2fs_put_page(new, 1); - return err; - } + restore_node_summary(sbi, segno, sum); } } @@ -3052,8 +3045,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) META_CP, true); /* restore for compacted data summary */ - if (read_compacted_summaries(sbi)) - return -EINVAL; + read_compacted_summaries(sbi); type = CURSEG_HOT_NODE; } From cd38d5ada5a4dcad36e9791a279cecc3de57bd13 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Dec 2017 14:11:40 +0800 Subject: [PATCH 0550/1212] f2fs: fix potential hangtask in f2fs_trace_pid As Jia-Ju Bai reported: "According to fs/f2fs/trace.c, the kernel module may sleep under a spinlock. The function call path is: f2fs_trace_pid (acquire the spinlock) f2fs_radix_tree_insert cond_resched --> may sleep I do not find a good way to fix it, so I only report. 
This possible bug is found by my static analysis tool (DSAC) and my code review." Obviously, it's problematic to schedule in the critical region of a spinlock, which will cause uninterruptible sleep if there is no waker. This patch changes to use a mutex lock instead of a spinlock to avoid this condition. Reported-by: Jia-Ju Bai Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index bccbbf2616d2..a1fcd00bbb2b 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -17,7 +17,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; +static struct mutex pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -64,7 +64,7 @@ void f2fs_trace_pid(struct page *page) if (radix_tree_preload(GFP_NOFS)) return; - spin_lock(&pids_lock); + mutex_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; @@ -77,7 +77,7 @@ void f2fs_trace_pid(struct page *page) MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - spin_unlock(&pids_lock); + mutex_unlock(&pids_lock); radix_tree_preload_end(); } @@ -122,7 +122,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - spin_lock_init(&pids_lock); + mutex_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -150,7 +150,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - spin_lock(&pids_lock); + mutex_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -158,5 +158,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - spin_unlock(&pids_lock); + mutex_unlock(&pids_lock); } From 8a2c11d8658d8136352713372fbd143a60af5533 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 5 Dec 2017 12:07:47 +0800 Subject: [PATCH 0551/1212] f2fs: fix an
error case of missing update inode page -Thread A Thread B -write_checkpoint -block_operations -f2fs_unlock_all -f2fs_sync_file -f2fs_write_inode -f2fs_inode_synced -f2fs_sync_inode_meta -sync_node_pages -set_page_drity In this case, if sudden power off without next new checkpoint, the last inode page update will lost. wb_writeback is same with fsync. Yunlei also reproduced the bug by: @@ -366,7 +366,7 @@ int update_inode(struct inode *inode, struct page *node_page) struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); - + msleep(10000); f2fs_wait_on_page_writeback(node_page, NODE, true); shell 1: shell2: dd if=/dev/zero of=./test bs=1M count=10 sync echo "hello" >> ./test fsync test // sleep 10s sync //return quickly echo c > /proc/sysrq-trigger Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/inode.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b59be85c5e24..9502ec303555 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2607,8 +2607,8 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); -int update_inode(struct inode *inode, struct page *node_page); -int update_inode_page(struct inode *inode); +void update_inode(struct inode *inode, struct page *node_page); +void update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); void handle_failed_inode(struct inode *inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b4c4f2b25304..234322889e65 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -360,14 +360,15 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) return 
inode; } -int update_inode(struct inode *inode, struct page *node_page) +void update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; - f2fs_inode_synced(inode); - f2fs_wait_on_page_writeback(node_page, NODE, true); + set_page_dirty(node_page); + + f2fs_inode_synced(inode); ri = F2FS_INODE(node_page); @@ -426,14 +427,12 @@ int update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - return set_page_dirty(node_page); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; - int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -444,11 +443,10 @@ int update_inode_page(struct inode *inode) } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - return 0; + return; } - ret = update_inode(inode, node_page); + update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) From e2bb618a0a6bb232c22b37d27a5a631f2fc198af Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 19 Dec 2017 19:16:34 -0800 Subject: [PATCH 0552/1212] f2fs: return error during fill_super Let's avoid BUG_ON during fill_super, when on-disk was totall corrupted. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++---- fs/f2fs/segment.h | 22 ++++++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2206c297ec16..6af71864b501 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3557,7 +3557,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } -static void build_sit_entries(struct f2fs_sb_info *sbi) +static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); @@ -3567,6 +3567,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; + int err = 0; do { readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, @@ -3585,7 +3586,9 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); - check_block_count(sbi, start, &sit); + err = check_block_count(sbi, start, &sit); + if (err) + return err; seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ @@ -3620,7 +3623,9 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) old_valid_blocks = se->valid_blocks; - check_block_count(sbi, start, &sit); + err = check_block_count(sbi, start, &sit); + if (err) + break; seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { @@ -3640,6 +3645,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) se->valid_blocks - old_valid_blocks; } up_read(&curseg->journal_rwsem); + return err; } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -3814,7 +3820,9 @@ int build_segment_manager(struct f2fs_sb_info *sbi) return err; /* reinit free segmap based on SIT */ - build_sit_entries(sbi); + err = build_sit_entries(sbi); + if (err) + return err; init_free_segmap(sbi); err = 
build_dirty_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5264b6ed120c..5c4d432ebf1d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -655,7 +655,7 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) /* * Summary block is always treated as an invalid block */ -static inline void check_block_count(struct f2fs_sb_info *sbi, +static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { #ifdef CONFIG_F2FS_CHECK_FS @@ -677,11 +677,25 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, cur_pos = next_pos; is_valid = !is_valid; } while (cur_pos < sbi->blocks_per_seg); - BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); + + if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Mismatch valid blocks %d vs. %d", + GET_SIT_VBLOCKS(raw_sit), valid_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EINVAL; + } #endif /* check segment usage, and check boundary of a given segment number */ - f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg - || segno > TOTAL_SEGS(sbi) - 1); + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong valid blocks %d or segno %u", + GET_SIT_VBLOCKS(raw_sit), segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EINVAL; + } + return 0; } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, From cff2c7fe417b5f5750af6e665dd972e8efe70761 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Dec 2017 08:09:44 -0800 Subject: [PATCH 0553/1212] f2fs: recover directory operations by fsync This fixes generic/342 which doesn't recover renamed file which was fsynced before. It will be done via another fsync on newly created file. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 ++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 3 +++ fs/f2fs/namei.c | 4 ++++ include/trace/events/f2fs.h | 3 ++- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 55fb45b66ed2..bde445e4e690 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -713,6 +713,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9502ec303555..8fa9cc3cdf23 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -247,6 +247,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ + TRANS_DIR_INO, /* for trasactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. 
list */ }; @@ -988,6 +989,7 @@ enum cp_reason_type { CP_NODE_NEED_CP, CP_FASTBOOT_MODE, CP_SPEC_LOG_NUM, + CP_RECOVER_DIR, }; enum iostat_type { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b88efbfd22e7..de0a167c8238 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -168,6 +168,9 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; + else if (need_dentry_mark(sbi, inode->i_ino) && + exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) + cp_reason = CP_RECOVER_DIR; return cp_reason; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f44ce8c34966..a72c226c4d30 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -932,6 +932,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_i_links_write(old_dir, false); } + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -1086,6 +1087,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); + add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_unlock_op(sbi); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 589df6f73789..0cdf6cc5c557 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -138,7 +138,8 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_NO_SPC_ROLL, "no space roll forward" }, \ { CP_NODE_NEED_CP, "node needs cp" }, \ { CP_FASTBOOT_MODE, "fastboot mode" }, \ - { CP_SPEC_LOG_NUM, "log type is 2" }) + { CP_SPEC_LOG_NUM, "log type is 2" }, \ + { CP_RECOVER_DIR, "dir needs recovery" }) struct victim_sel_policy; struct f2fs_map_blocks; From 25ef3006ba2320a9ee75d3afbfb02c482de9ee1b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Dec 2017 17:47:19 -0800 Subject: [PATCH 0554/1212] f2fs: fix missing error 
number for xattr operation This fixes generic/449 hang problem caused by no ENOSPC forever which should be returned by setxattr under disk full scenario. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 47ac858787ea..353fbff85bab 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -480,6 +480,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (F2FS_I(inode)->i_xattr_nid) { xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); alloc_nid_failed(sbi, new_nid); goto in_page_out; } @@ -490,6 +491,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); alloc_nid_failed(sbi, new_nid); goto in_page_out; } From 54bf13a0adcdb523deb12c23405a853115ee13bb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 31 Dec 2017 16:26:38 -0800 Subject: [PATCH 0555/1212] f2fs: skip stop_checkpoint for user data writes We can give another chance to write user data, which can resolve generic/441. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b7fd9f010b2b..a1dc4cfdcb8e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -110,7 +110,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_error)) { set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi, true); + if (type == F2FS_WB_CP_DATA) + f2fs_stop_checkpoint(sbi, true); } dec_page_count(sbi, type); clear_cold_data(page); From 2b4d859bd9d89cd0dd4b2da699930208583488d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 2 Jan 2018 11:03:19 -0800 Subject: [PATCH 0556/1212] f2fs: enable quota at remount from r to w We have to enable quota only when remounting from read to write. Otherwise, we'll get remount failure. (e.g., write to write case) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b6a96a8fb794..ff9affb32890 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1266,7 +1266,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else { + } else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~MS_RDONLY; if (sb_any_quota_suspended(sb)) { From 87b8168e9ef006e25036eba5fa0e7aa8ee95880a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 3 Jan 2018 17:30:19 +0800 Subject: [PATCH 0557/1212] f2fs: continue to do direct IO if we only preallocate partial blocks While doing direct IO, if we run out-of-space when we preallocate blocks, we should not return ENOSPC error directly, instead, we should continue to do following direct IO, which will keep directIO of f2fs acting like other filesystems. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a1dc4cfdcb8e..08b126366658 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -830,10 +830,12 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; + int flag; int err = 0; + bool direct_io = iocb->ki_flags & IOCB_DIRECT; /* convert inline data for Direct I/O*/ - if (iocb->ki_flags & IOCB_DIRECT) { + if (direct_io) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -852,26 +854,30 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; map.m_seg_type = NO_CHECK_TYPE; - if (iocb->ki_flags & IOCB_DIRECT) { + if (direct_io) { /* map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); */ map.m_seg_type = rw_hint_to_seg_type(WRITE_LIFE_NOT_SET); - return f2fs_map_blocks(inode, &map, 1, - __force_buffered_io(inode, WRITE) ? - F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO); + flag = __force_buffered_io(inode, WRITE) ? 
+ F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO; + goto map_blocks; } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - if (!f2fs_has_inline_data(inode)) { - err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (map.m_len > 0 && err == -ENOSPC) { - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } + if (f2fs_has_inline_data(inode)) return err; + + flag = F2FS_GET_BLOCK_PRE_AIO; + +map_blocks: + err = f2fs_map_blocks(inode, &map, 1, flag); + if (map.m_len > 0 && err == -ENOSPC) { + if (!direct_io) + set_inode_flag(inode, FI_NO_PREALLOC); + err = 0; } return err; } From b242d7edc5379043477d7eb817d2e7488f2fc16c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 3 Jan 2018 17:32:51 +0800 Subject: [PATCH 0558/1212] f2fs: clean up unneeded declaration Commit 6afc662e68b5 ("f2fs: support flexible inline xattr size") declared f2fs_sb_has_flexible_inline_xattr in f2fs.h for latter being used in get_inline_xattr_addrs, but in latter version, related code has been changed, leave f2fs_sb_has_flexible_inline_xattr w/o any users. Let's remove it for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8fa9cc3cdf23..4c9f762ed355 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2540,7 +2540,6 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } -static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); static inline int get_inline_xattr_addrs(struct inode *inode) { return F2FS_I(inode)->i_inline_xattr_size; From 591b336387338e054067ff82bb535d4fde06179f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 3 Jan 2018 10:55:07 -0800 Subject: [PATCH 0559/1212] f2fs: show precise # of blocks that user/root can use Let's show precise # of blocks that user/root can use through bavail and bfree respectively. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ff9affb32890..4f888e1c5bae 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -995,20 +995,19 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - block_t total_count, user_block_count, start_count, ovp_count; + block_t total_count, user_block_count, start_count; u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); - ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + buf->f_bfree = user_block_count - valid_user_blocks(sbi) - 
sbi->current_reserved_blocks; + buf->f_bavail = buf->f_bfree; avail_node_count = sbi->total_node_count - sbi->nquota_files - F2FS_RESERVED_NODE_NUM; From 4c6bc4be375adf7c4c99188829dfd17fccae930c Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 3 Jan 2018 18:03:04 +0800 Subject: [PATCH 0560/1212] f2fs: update inode info to inode page for new file After checkpoint, 1. create a new file A (with dirty inode && dirty inode page && xattr info) 2. background wb writes back file A inode page (without update from inode cache) 3. fsync file A, write back inode page of file A with inode cache info 4. sudden power off before new checkpoint In this case, recovery process will try to recover a zero inode page. Inline xattr flag of file A will be missing and xattr info will be taken as blkaddr index. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9453975c9799..ec486ec074da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2229,7 +2229,9 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); - if (!(ri->i_inline & F2FS_INLINE_XATTR)) { + if (ri->i_inline & F2FS_INLINE_XATTR) { + set_inode_flag(inode, FI_INLINE_XATTR); + } else { clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } From 06a366757ff766936c307afef902300f602cb6a2 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 4 Jan 2018 15:02:02 +0800 Subject: [PATCH 0561/1212] f2fs: check segment type in __f2fs_replace_block In some cases, a node block has a wrong blkaddr whose segment type is NODE, e.g., recover inode has missing xattr flag and the blkaddr is in the xattr range.
Since fsck.f2fs does not check the recovery nodes, this will cause __f2fs_replace_block change the curseg of node and do the update_sit_entry(sbi, new_blkaddr, 1) with no next_blkoff refresh, as a result, when recovery process write checkpoint and sync nodes, the next_blkoff of curseg is used in the segment bit map, then it will cause f2fs_bug_on. So let's check segment type in __f2fs_replace_block. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6af71864b501..96b01c7bea42 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2820,6 +2820,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } } + f2fs_bug_on(sbi, !IS_DATASEG(type)); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From 62438ba87b798597d73aa86d0181aaafd11cd067 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 27 Dec 2017 15:05:52 -0800 Subject: [PATCH 0562/1212] f2fs: add reserved blocks for root user This patch allows root to reserve some blocks via mount option. "-o reserve_root=N" means N x 4KB-sized blocks for root only. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 20 ++++++++++++++++---- fs/f2fs/super.c | 37 ++++++++++++++++++++++++++++++++++++- fs/f2fs/sysfs.c | 3 ++- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4c9f762ed355..38d595b99f58 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -96,6 +96,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 +#define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -1169,6 +1170,7 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + block_t root_reserved_blocks; /* root reserved blocks */ unsigned int nquota_files; /* # of quota sysfile */ @@ -1647,11 +1649,17 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->total_valid_block_count += (block_t)(*count); avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; + + if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + avail_user_block_count -= sbi->root_reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; + if (diff > *count) + diff = *count; *count -= diff; release = diff; - sbi->total_valid_block_count = avail_user_block_count; + sbi->total_valid_block_count -= diff; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); @@ -1840,9 +1848,13 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + 1; - if 
(unlikely(valid_block_count + sbi->current_reserved_blocks > - sbi->user_block_count)) { + valid_block_count = sbi->total_valid_block_count + + sbi->current_reserved_blocks + 1; + + if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + valid_block_count += sbi->root_reserved_blocks; + + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4f888e1c5bae..9ec270a961e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -107,6 +107,7 @@ enum { Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, + Opt_reserve_root, Opt_mode, Opt_io_size_bits, Opt_fault_injection, @@ -157,6 +158,7 @@ static match_table_t f2fs_tokens = { {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, + {Opt_reserve_root, "reserve_root=%u"}, {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, @@ -191,6 +193,19 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
va_end(args); } +static inline void limit_reserve_root(struct f2fs_sb_info *sbi) +{ + block_t limit = (sbi->user_block_count << 1) / 1000; + + /* limit is 0.2% */ + if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) { + sbi->root_reserved_blocks = limit; + f2fs_msg(sbi->sb, KERN_INFO, + "Reduce reserved blocks for root = %u", + sbi->root_reserved_blocks); + } +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -488,6 +503,18 @@ static int parse_options(struct super_block *sb, char *options) case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; + case Opt_reserve_root: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (test_opt(sbi, RESERVE_ROOT)) { + f2fs_msg(sb, KERN_INFO, + "Preserve previous reserve_root=%u", + sbi->root_reserved_blocks); + } else { + sbi->root_reserved_blocks = arg; + set_opt(sbi, RESERVE_ROOT); + } + break; case Opt_mode: name = match_strdup(&args[0]); @@ -1007,7 +1034,10 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - buf->f_bavail = buf->f_bfree; + if (buf->f_bfree > sbi->root_reserved_blocks) + buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks; + else + buf->f_bavail = 0; avail_node_count = sbi->total_node_count - sbi->nquota_files - F2FS_RESERVED_NODE_NUM; @@ -1136,6 +1166,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (test_opt(sbi, RESERVE_ROOT)) + seq_printf(seq, ",reserve_root=%u", + sbi->root_reserved_blocks); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1334,6 +1367,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags = (sb->s_flags & 
~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + limit_reserve_root(sbi); return 0; restore_gc: if (need_restart_gc) { @@ -2577,6 +2611,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; sbi->current_reserved_blocks = 0; + limit_reserve_root(sbi); for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 93c3364250dd..ab6028c332aa 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -162,7 +162,8 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if (t > (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)(sbi->user_block_count - + sbi->root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } From b78e9302e2e358d45ea4377bf2c20d045f1c3b8a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 5 Jan 2018 09:41:20 +0000 Subject: [PATCH 0563/1212] f2fs: make local functions static Fixes the following sparse warnings: fs/f2fs/segment.c:887:6: warning: symbol '__check_sit_bitmap' was not declared. Should it be static? fs/f2fs/segment.c:1327:6: warning: symbol 'f2fs_wait_discard_bio' was not declared. Should it be static? fs/f2fs/super.c:1661:5: warning: symbol 'f2fs_get_projid' was not declared. Should it be static? 
Signed-off-by: Wei Yongjun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 96b01c7bea42..116e50470360 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -965,7 +965,7 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } -void __check_sit_bitmap(struct f2fs_sb_info *sbi, +static void __check_sit_bitmap(struct f2fs_sb_info *sbi, block_t start, block_t end) { #ifdef CONFIG_F2FS_CHECK_FS @@ -1404,7 +1404,7 @@ static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, } /* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9ec270a961e2..ec13397b635c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1693,7 +1693,7 @@ void f2fs_quota_off_umount(struct super_block *sb) } #if 0 -int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { *projid = F2FS_I(inode)->i_projid; return 0; From d4f19f6266abaf573312c78723e09fb6498980ab Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 8 Jan 2018 18:48:33 +0800 Subject: [PATCH 0564/1212] f2fs: avoid high cpu usage in discard thread We take very long time to finish generic/476, this is because we will check consistence of all discard entries in global rb tree while traversing all different granularity pending lists, even when the list is empty, in order to avoid that unneeded overhead, we have to skip the check when coming up an empty list. 
generic/476 time consumption: cost Before patch & w/o consistence check 57s Before patch & w/ consistence check 1426s After patch 78s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 116e50470360..d13e36b292a4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1284,6 +1284,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); + if (list_empty(pend_list)) + goto next; f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { @@ -1302,6 +1304,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, break; } blk_finish_plug(&plug); +next: mutex_unlock(&dcc->cmd_lock); if (iter >= dpolicy->max_requests) From 1338f376d5a344fe786cc497e68c223508d2a937 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 8 Jan 2018 18:48:34 +0800 Subject: [PATCH 0565/1212] f2fs: remove unused pend_list_tag In commit 78997b569f56 ("f2fs: split discard policy"), we got rid of using the pend_list_tag field in struct discard_cmd_control, but forgot to remove it, so remove it now.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 38d595b99f58..635866e33a31 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -334,7 +334,6 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ - unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ From f53dcf6799abaf7776bc82679beb4382e8a85f9b Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Tue, 9 Jan 2018 19:33:39 +0800 Subject: [PATCH 0566/1212] f2fs: implement cgroup writeback support Cgroup writeback requires explicit support from the filesystem. f2fs's data and node writeback IOs go through __write_data_page, which sets fio for submitting IOs. So, we add io_wbc for fio, associate bios with blkcg by invoking wbc_init_bio() and account issued IOs with wbc_account_io(). In addition, f2fs_fill_super() is updated to set SB_I_CGROUPWB. Meta writeback IOs are left alone by this patch and will always be attributed to the root cgroup. The results show that f2fs can throttle writeback nicely for data writing and file creating. Reviewed-by: Chao Yu Signed-off-by: Yufen Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 ++++++++++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 1 + fs/f2fs/super.c | 1 + 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 08b126366658..8148dff3732e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -168,6 +168,7 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, * Low-level block read/write IO operations.
*/ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + struct writeback_control *wbc, int npages, bool is_read) { struct bio *bio; @@ -177,6 +178,8 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; + if (wbc) + wbc_init_bio(wbc, bio); return bio; } @@ -372,7 +375,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, + 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -434,7 +438,7 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } - io->bio = __bio_alloc(sbi, fio->new_blkaddr, + io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, BIO_MAX_PAGES, false); io->fio = *fio; } @@ -444,6 +448,9 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) goto alloc_new; } + if (fio->io_wbc) + wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); @@ -1528,6 +1535,7 @@ static int __write_data_page(struct page *page, bool *submitted, .submitted = false, .need_lock = LOCK_RETRY, .io_type = io_type, + .io_wbc = wbc, }; trace_f2fs_writepage(page, DATA); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 635866e33a31..90b412bb4e3b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1025,6 +1025,7 @@ struct f2fs_io_info { int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ enum iostat_type io_type; /* io type */ + struct writeback_control *io_wbc; /* writeback control */ }; #define is_read_io(rw) ((rw) == READ) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 
ec486ec074da..676b0e3f5ef3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1339,6 +1339,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .encrypted_page = NULL, .submitted = false, .io_type = io_type, + .io_wbc = wbc, }; trace_f2fs_writepage(page, NODE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ec13397b635c..38e33f6d1f93 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2526,6 +2526,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + sb->s_iflags |= SB_I_CGROUPWB; /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; From 2ce6b9d8167e2785ea01011bc60c2f95b6313dea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 4 Jan 2018 21:36:09 -0800 Subject: [PATCH 0567/1212] f2fs: add resgid and resuid to reserve root blocks This patch adds mount options to reserve some blocks via resgid=%u,resuid=%u. It only activates with reserve_root=%u. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 27 +++++++++++++++++++++++++-- fs/f2fs/super.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 90b412bb4e3b..c35f87423123 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -198,6 +199,12 @@ static inline struct timespec current_time(struct inode *inode) return timespec_trunc(now, inode->i_sb->s_time_gran); } +/* + * Default values for user and/or group using reserved blocks + */ +#define F2FS_DEF_RESUID 0 +#define F2FS_DEF_RESGID 0 + /* * For checkpoint manager */ @@ -1171,6 +1178,8 @@ struct f2fs_sb_info { block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ unsigned int nquota_files; /* # of quota sysfile */ @@ -1620,6 +1629,20 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi) +{ + if (!test_opt(sbi, RESERVE_ROOT)) + return false; + if (capable(CAP_SYS_RESOURCE)) + return true; + if (uid_eq(sbi->s_resuid, current_fsuid())) + return true; + if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && + in_group_p(sbi->s_resgid)) + return true; + return false; +} + static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) @@ -1650,7 +1673,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + if 
(!__allow_reserved_blocks(sbi)) avail_user_block_count -= sbi->root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { @@ -1851,7 +1874,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, valid_block_count = sbi->total_valid_block_count + sbi->current_reserved_blocks + 1; - if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + if (!__allow_reserved_blocks(sbi)) valid_block_count += sbi->root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 38e33f6d1f93..d89834b79646 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -108,6 +108,8 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_reserve_root, + Opt_resgid, + Opt_resuid, Opt_mode, Opt_io_size_bits, Opt_fault_injection, @@ -159,6 +161,8 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_reserve_root, "reserve_root=%u"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, @@ -204,6 +208,15 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) "Reduce reserved blocks for root = %u", sbi->root_reserved_blocks); } + if (!test_opt(sbi, RESERVE_ROOT) && + (!uid_eq(sbi->s_resuid, + make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || + !gid_eq(sbi->s_resgid, + make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) + f2fs_msg(sbi->sb, KERN_INFO, + "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + from_kuid_munged(&init_user_ns, sbi->s_resuid), + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } static void init_once(void *foo) @@ -336,6 +349,8 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; + kuid_t uid; + kgid_t gid; #ifdef CONFIG_QUOTA int ret; #endif @@ -515,6 +530,28 @@ static int parse_options(struct super_block *sb, char 
*options) set_opt(sbi, RESERVE_ROOT); } break; + case Opt_resuid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid uid value %d", arg); + return -EINVAL; + } + sbi->s_resuid = uid; + break; + case Opt_resgid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid gid value %d", arg); + return -EINVAL; + } + sbi->s_resgid = gid; + break; case Opt_mode: name = match_strdup(&args[0]); @@ -1167,8 +1204,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (test_opt(sbi, RESERVE_ROOT)) - seq_printf(seq, ",reserve_root=%u", - sbi->root_reserved_blocks); + seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + sbi->root_reserved_blocks, + from_kuid_munged(&init_user_ns, sbi->s_resuid), + from_kgid_munged(&init_user_ns, sbi->s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -2463,6 +2502,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; + sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, From d49132d45cb07dc77904bf9b6501df2dd77b251b Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 10 Jan 2018 16:49:10 +0900 Subject: [PATCH 0568/1212] f2fs: handle newly created page when revoking inmem pages When committing inmem pages is successful, we revoke already committed blocks in __revoke_inmem_pages() and finally replace the committed ones with the old blocks using f2fs_replace_block(). 
However, if the committed block was a newly created one, the address of the old block is NEW_ADDR and __f2fs_replace_block() cannot handle NEW_ADDR as new_blkaddr properly and a kernel panic occurs. Signed-off-by: Daeho Jeong Tested-by: Shu Tan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d13e36b292a4..7638ebb1c343 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -248,7 +248,11 @@ static int __revoke_inmem_pages(struct inode *inode, goto next; } get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + if (cur->old_addr == NEW_ADDR) { + invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } else + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, cur->old_addr, ni.version, true, true); f2fs_put_dnode(&dn); } From b203c58dfd5538d1a7f99737db6d3653b7601c82 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 10 Jan 2018 18:18:51 +0800 Subject: [PATCH 0569/1212] f2fs: fix to caclulate required free section correctly When calculating required free section during file defragmenting, we should skip holes in file, otherwise we will probably fail to defrag sparse file with large size.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index de0a167c8238..56f6b21cd9a9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2101,10 +2101,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, continue; } - if (blk_end && blk_end != map.m_pblk) { + if (blk_end && blk_end != map.m_pblk) fragmented = true; - break; - } + + /* record total count of block that we're going to move */ + total += map.m_len; + blk_end = map.m_pblk + map.m_len; map.m_lblk += map.m_len; @@ -2113,10 +2115,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, if (!fragmented) goto out; - map.m_lblk = pg_start; - map.m_len = pg_end - pg_start; - - sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); + sec_num = (total + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -2128,6 +2127,10 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; } + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + total = 0; + while (map.m_lblk < pg_end) { pgoff_t idx; int cnt = 0; From 10f4a4140b618ea89740dd76ff47aedbd5161f84 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 11 Jan 2018 14:19:32 +0800 Subject: [PATCH 0570/1212] f2fs: check node page again in write end io Check node page again in write end io in case of data corruption during in-flight IO.
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8148dff3732e..bbb6eb79351f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -113,6 +113,10 @@ static void f2fs_write_end_io(struct bio *bio) if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true); } + + f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && + page->index != nid_of_node(page)); + dec_page_count(sbi, type); clear_cold_data(page); end_page_writeback(page); From 6afa9a94d09b1e9155db2aa41555e31696d31bf7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 11 Jan 2018 14:37:35 +0800 Subject: [PATCH 0571/1212] f2fs: fix to cover f2fs_inline_data_fiemap with inode_lock This patch fix to cover f2fs_inline_data_fiemap with inode_lock in order to make that interface avoiding race with mapping change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bbb6eb79351f..ab32b33c8e02 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1154,14 +1154,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; + inode_lock(inode); + if (f2fs_has_inline_data(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) - return ret; + goto out; } - inode_lock(inode); - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); From 58b1f5b0fcf1b203c3a44eaedd0f2db572a01069 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 11 Jan 2018 14:39:57 +0800 Subject: [PATCH 0572/1212] f2fs: support FIEMAP_FLAG_XATTR This patch enables ->fiemap to handle FIEMAP_FLAG_XATTR flag for xattr mapping info lookup purpose. It makes f2fs passing generic/425 test in fstest. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ab32b33c8e02..2be6e1999ab3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1140,6 +1140,68 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) return (blk << inode->i_blkbits); } +static int f2fs_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + struct node_info ni; + __u64 phys = 0, len; + __u32 flags; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int err = 0; + + if (f2fs_has_inline_xattr(inode)) { + int offset; + + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), + inode->i_ino, false); + if (!page) + return -ENOMEM; + + get_node_info(sbi, inode->i_ino, &ni); + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + offset = offsetof(struct f2fs_inode, i_addr) + + sizeof(__le32) * (DEF_ADDRS_PER_INODE - + F2FS_INLINE_XATTR_ADDRS(inode)); + + phys += offset; + len = inline_xattr_size(inode); + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; + + if (!xnid) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + if (err || err == 1) + return err; + } + + if (xnid) { + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); + if (!page) + return -ENOMEM; + + get_node_info(sbi, xnid, &ni); + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + len = inode->i_sb->s_blocksize; + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_LAST; + } + + if (phys) + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + + return (err < 0 ? 
err : 0); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1150,12 +1212,17 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; int ret = 0; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return ret; inode_lock(inode); + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + ret = f2fs_xattr_fiemap(inode, fieinfo); + goto out; + } + if (f2fs_has_inline_data(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) From 292c8e1cfd4d2eafdd7e90ea269f146dc275e412 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 11 Jan 2018 11:26:19 +0900 Subject: [PATCH 0573/1212] f2fs: prevent newly created inode from being dirtied incorrectly Now, we invoke f2fs_mark_inode_dirty_sync() to make an inode dirty in advance of creating a new node page for the inode. By this, some inodes whose node page is not created yet can be linked into the global dirty list. If the checkpoint is executed at this moment, the inode will be written back by writeback_single_inode() and finally update_inode_page() will fail to detach the inode from the global dirty list because the inode doesn't have a node page. The problem is that the inode's state in VFS layer will become clean after execution of writeback_single_inode() and it's still linked in the global dirty list of f2fs and this will cause a kernel panic. So, we will prevent the newly created inode from being dirtied during the FI_NEW_INODE flag of the inode is set. We will make it dirty right after the flag is cleared. 
Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Tested-by: Hobin Woo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 3 +++ fs/f2fs/namei.c | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c35f87423123..c26e4a3a04b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2214,6 +2214,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_INLINE_XATTR: case FI_INLINE_DATA: case FI_INLINE_DENTRY: + case FI_NEW_INODE: if (set) return; case FI_DATA_EXIST: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 234322889e65..1dc77a40d0ad 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -22,6 +22,9 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) + return; + if (f2fs_inode_dirtied(inode, sync)) return; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a72c226c4d30..7573779a8e7c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -74,12 +74,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) goto fail_drop; + set_inode_flag(inode, FI_NEW_INODE); + /* If the directory encrypted, then we should encrypt the inode. */ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - set_inode_flag(inode, FI_NEW_INODE); - if (f2fs_sb_has_extra_attr(sbi->sb)) { set_inode_flag(inode, FI_EXTRA_ATTR); F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; From 84960fca96c4330d79d384fe21357f26537357de Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Dec 2017 16:25:39 -0800 Subject: [PATCH 0574/1212] f2fs: add an ioctl to disable GC for specific file This patch gives a flag to disable GC on given file, which would be useful, when user wants to keep its block map. It also conducts in-place-update for dontmove file. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 + fs/f2fs/f2fs.h | 29 +++++++++++++- fs/f2fs/file.c | 83 +++++++++++++++++++++++++++++++++++++++++ fs/f2fs/gc.c | 11 ++++++ fs/f2fs/gc.h | 2 + fs/f2fs/sysfs.c | 2 + include/linux/f2fs_fs.h | 9 ++++- 7 files changed, 136 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2be6e1999ab3..8c539fe293c8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1478,6 +1478,8 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; + if (f2fs_is_pinned_file(inode)) + return true; if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; if (is_cold_data(fio->page)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c26e4a3a04b2..8a3096f3b3d3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -423,6 +423,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) #define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) +#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) +#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -657,7 +659,10 @@ struct f2fs_inode_info { unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ - unsigned int i_current_depth; /* use only in directory structure */ + union { + unsigned int i_current_depth; /* only for directory depth */ + unsigned short i_gc_failures; /* only for regular file */ + }; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ @@ -1206,6 +1211,9 @@ struct f2fs_sb_info { /* threshold for converting bg 
victims for fg */ u64 fggc_threshold; + /* threshold for gc trials on pinned files */ + u64 gc_pin_file_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -2205,6 +2213,7 @@ enum { FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2219,6 +2228,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: + case FI_PIN_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2299,6 +2309,13 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) f2fs_mark_inode_dirty_sync(inode, true); } +static inline void f2fs_i_gc_failures_write(struct inode *inode, + unsigned int count) +{ + F2FS_I(inode)->i_gc_failures = count; + f2fs_mark_inode_dirty_sync(inode, true); +} + static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; @@ -2327,6 +2344,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_INLINE_DOTS, &fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) set_bit(FI_EXTRA_ATTR, &fi->flags); + if (ri->i_inline & F2FS_PIN_FILE) + set_bit(FI_PIN_FILE, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2345,6 +2364,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_INLINE_DOTS; if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) ri->i_inline |= F2FS_EXTRA_ATTR; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + ri->i_inline |= F2FS_PIN_FILE; } static inline int f2fs_has_extra_attr(struct inode *inode) @@ -2390,6 +2411,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DOTS); } 
+static inline bool f2fs_is_pinned_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_PIN_FILE); +} + static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(inode, FI_ATOMIC_FILE); @@ -2634,6 +2660,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void truncate_data_blocks_range(struct dnode_of_data *dn, int count); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_pin_file_control(struct inode *inode, bool inc); /* * inode.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 56f6b21cd9a9..617ff6f6f268 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2454,6 +2454,83 @@ static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) return put_user(sb_feature, (u32 __user *)arg); } +int f2fs_pin_file_control(struct inode *inode, bool inc) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* Use i_gc_failures for normal file as a risk signal. 
*/ + if (inc) + f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + + if (fi->i_gc_failures > sbi->gc_pin_file_threshold) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: Enable GC = ino %lx after %x GC trials\n", + __func__, inode->i_ino, fi->i_gc_failures); + clear_inode_flag(inode, FI_PIN_FILE); + return -EAGAIN; + } + return 0; +} + +static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(pin, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (!pin) { + clear_inode_flag(inode, FI_PIN_FILE); + F2FS_I(inode)->i_gc_failures = 1; + goto done; + } + + if (f2fs_pin_file_control(inode, false)) { + ret = -EAGAIN; + goto out; + } + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_PIN_FILE); + ret = F2FS_I(inode)->i_gc_failures; +done: + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin = 0; + + if (is_inode_flag_set(inode, FI_PIN_FILE)) + pin = F2FS_I(inode)->i_gc_failures; + return put_user(pin, (u32 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -2500,6 +2577,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_flush_device(filp, arg); case F2FS_IOC_GET_FEATURES: return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_GET_PIN_FILE: + return f2fs_ioc_get_pin_file(filp, arg); + case F2FS_IOC_SET_PIN_FILE: + return 
f2fs_ioc_set_pin_file(filp, arg); default: return -ENOTTY; } @@ -2578,6 +2659,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_GET_PIN_FILE: + case F2FS_IOC_SET_PIN_FILE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index be9fd616736b..d98b869456ce 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -624,6 +624,11 @@ static void move_data_block(struct inode *inode, block_t bidx, if (f2fs_is_atomic_file(inode)) goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_pin_file_control(inode, true); + goto out; + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -720,6 +725,11 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (f2fs_is_atomic_file(inode)) goto out; + if (f2fs_is_pinned_file(inode)) { + if (gc_type == FG_GC) + f2fs_pin_file_control(inode, true); + goto out; + } if (gc_type == BG_GC) { if (PageWriteback(page)) @@ -1091,6 +1101,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) sbi->fggc_threshold = div64_u64((main_count - ovp_count) * BLKS_PER_SEC(sbi), (main_count - resv_count)); + sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ if (sbi->s_ndevs && sbi->segs_per_sec == 1) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 9325191fab2d..b0045d4c8d1e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -20,6 +20,8 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ +#define DEF_GC_FAILED_PINNED_FILES 2048 + /* Search max. 
number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index ab6028c332aa..41887e6ec1b3 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -301,6 +301,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -349,6 +350,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), + ATTR_LIST(gc_pin_file_thresh), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index fef1caeddf54..9bba23187c04 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -212,6 +212,7 @@ struct f2fs_extent { #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ #define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ +#define F2FS_PIN_FILE 0x40 /* file should not be gced */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -229,7 +230,13 @@ struct f2fs_inode { __le32 i_ctime_nsec; /* change time in nano scale */ __le32 i_mtime_nsec; /* modification time in nano scale */ __le32 i_generation; /* file version (for NFS) */ - __le32 i_current_depth; /* only for directory depth */ + union { + __le32 i_current_depth; /* only for directory depth */ + __le16 i_gc_failures; /* + * # of gc failures on pinned file. + * only for regular files. 
+ */ + }; __le32 i_xattr_nid; /* nid to save xattr */ __le32 i_flags; /* file attributes */ __le32 i_pino; /* parent inode number */ From 999f806a7c9e29e74019e6c2566be04d54c956ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 11 Jan 2018 14:42:30 +0800 Subject: [PATCH 0575/1212] f2fs: support F2FS_IOC_PRECACHE_EXTENTS This patch introduces a new ioctl F2FS_IOC_PRECACHE_EXTENTS to precache extent info like ext4, in order to gain better performance when triggering AIO by eliminating synchronous waiting of mapping info. Referred commit: 7869a4a6c5ca ("ext4: add support for extent pre-caching") In addition, with the newly added extent precache ability, this patch adds support for FIEMAP_FLAG_CACHE in ->fiemap. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/file.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c539fe293c8..95a649467272 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -863,6 +863,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_len = 0; map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { @@ -930,6 +931,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, blkcnt_t prealloc; struct extent_info ei = {0,0,0}; block_t blkaddr; + unsigned int start_pgofs; if (!maxblocks) return 0; @@ -945,6 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgofs + map->m_len; goto out; } @@ -963,10 +967,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (map->m_next_pgofs) *map->m_next_pgofs = get_next_page_offset(&dn, pgofs); + if 
(map->m_next_extent) + *map->m_next_extent = + get_next_page_offset(&dn, pgofs); } goto unlock_out; } + start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); @@ -1000,6 +1008,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_pblk = 0; goto sync_out; } + if (flag == F2FS_GET_BLOCK_PRECACHE) + goto sync_out; if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { if (map->m_next_pgofs) @@ -1058,6 +1068,16 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, else if (dn.ofs_in_node < end_offset) goto next_block; + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + } + f2fs_put_dnode(&dn); if (create) { @@ -1067,6 +1087,17 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, goto next_dnode; sync_out: + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + if (map->m_next_extent) + *map->m_next_extent = pgofs + 1; + } f2fs_put_dnode(&dn); unlock_out: if (create) { @@ -1088,6 +1119,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; + map.m_next_extent = NULL; map.m_seg_type = seg_type; err = f2fs_map_blocks(inode, &map, create, flag); @@ -1212,6 +1244,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; int ret = 0; + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + ret = f2fs_precache_extents(inode); + if (ret) + return ret; + } + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return 
ret; @@ -1313,6 +1351,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; for (; nr_pages; nr_pages--) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8a3096f3b3d3..28e5a52aadb4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -425,6 +425,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) #define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) +#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -617,6 +618,7 @@ struct f2fs_map_blocks { unsigned int m_len; unsigned int m_flags; pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; }; @@ -627,6 +629,7 @@ enum { F2FS_GET_BLOCK_BMAP, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, + F2FS_GET_BLOCK_PRECACHE, }; /* @@ -2658,6 +2661,7 @@ int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, int f2fs_setattr(struct dentry *dentry, struct iattr *attr); int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_pin_file_control(struct inode *inode, bool inc); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 617ff6f6f268..29c1aed384f6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1407,7 +1407,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, { struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_seg_type = NO_CHECK_TYPE }; + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -2048,7 +2048,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_seg_type = NO_CHECK_TYPE }; + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; @@ -2531,6 +2531,43 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) return put_user(pin, (u32 __user *)arg); } +int f2fs_precache_extents(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_map_blocks map; + pgoff_t m_next_extent; + loff_t end; + int err; + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return -EOPNOTSUPP; + + map.m_lblk = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = &m_next_extent; + map.m_seg_type = NO_CHECK_TYPE; + end = F2FS_I_SB(inode)->max_file_blocks; + + while (map.m_lblk < end) { + map.m_len = end - map.m_lblk; + + down_write(&fi->dio_rwsem[WRITE]); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); + up_write(&fi->dio_rwsem[WRITE]); + if (err) + return err; + + map.m_lblk = m_next_extent; + } + + return err; +} + +static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) +{ + return f2fs_precache_extents(file_inode(filp)); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -2581,6 +2618,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_pin_file(filp, arg); case F2FS_IOC_SET_PIN_FILE: return f2fs_ioc_set_pin_file(filp, arg); + case F2FS_IOC_PRECACHE_EXTENTS: + return f2fs_ioc_precache_extents(filp, arg); default: return 
-ENOTTY; } @@ -2661,6 +2700,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_FEATURES: case F2FS_IOC_GET_PIN_FILE: case F2FS_IOC_SET_PIN_FILE: + case F2FS_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; From 4dca47531eb037d663a903508f636f8758cbc172 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 10 Jan 2018 18:18:52 +0800 Subject: [PATCH 0576/1212] f2fs: speed up defragment on sparse file We have supported to get next page offset with valid mapping crossing hole in f2fs_map_blocks, utilizing it to speed up defragment on sparse file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++++- fs/f2fs/file.c | 11 ++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 95a649467272..ec6698bc8021 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1016,8 +1016,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, *map->m_next_pgofs = pgofs + 1; goto sync_out; } - if (flag != F2FS_GET_BLOCK_FIEMAP) + if (flag != F2FS_GET_BLOCK_FIEMAP) { + /* for defragment case */ + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; goto sync_out; + } } } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 29c1aed384f6..02c20d55cf90 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2047,10 +2047,10 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_defragment *range) { struct inode *inode = file_inode(filp); - struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; + struct f2fs_map_blocks map = { .m_next_extent = NULL, + .m_seg_type = NO_CHECK_TYPE }; struct extent_info ei = {0,0,0}; - pgoff_t pg_start, pg_end; + pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; block_t blk_end = 0; @@ -2084,6 +2084,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, } 
map.m_lblk = pg_start; + map.m_next_pgofs = &next_pgofs; /* * lookup mapping info in dnode page cache, skip defragmenting if all @@ -2097,7 +2098,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; if (!(map.m_flags & F2FS_MAP_FLAGS)) { - map.m_lblk++; + map.m_lblk = next_pgofs; continue; } @@ -2142,7 +2143,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto clear_out; if (!(map.m_flags & F2FS_MAP_FLAGS)) { - map.m_lblk++; + map.m_lblk = next_pgofs; continue; } From 18d267c273a96a600279b576f492439f969ca6a6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 15 Jan 2018 17:16:46 +0800 Subject: [PATCH 0577/1212] f2fs: fix to drop all inmem pages correctly In commit 57864ae5ce3a ("f2fs: limit # of inmemory pages"), we have limited memory footprint of all inmem pages with 20% of total memory, otherwise, if we exceed the threshold, we will try to drop all inmem pages to avoid excessive memory pressure resulting in performance regression. But in some unrelated error paths, we will also drop all inmem pages, which should be wrong, fix it in this patch. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ec6698bc8021..6401cf431026 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2082,7 +2082,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false; + bool need_balance = false, drop_atomic = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -2091,6 +2091,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (f2fs_is_atomic_file(inode) && !available_free_memory(sbi, INMEM_PAGES)) { err = -ENOMEM; + drop_atomic = true; goto fail; } @@ -2171,7 +2172,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); - if (f2fs_is_atomic_file(inode)) + if (drop_atomic) drop_inmem_pages_all(sbi); return err; } From c4027d08430b904f9000a13dabaced9078fd0a11 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 5 Jan 2018 16:02:36 -0800 Subject: [PATCH 0578/1212] f2fs: allow quota to use reserved blocks This patch allows quota to use reserved blocks all the time. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 28e5a52aadb4..6dc15d0ea3c8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1640,10 +1640,15 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi) +static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, + struct inode *inode) { + if (!inode) + return true; if (!test_opt(sbi, RESERVE_ROOT)) return false; + if (IS_NOQUOTA(inode)) + return true; if (capable(CAP_SYS_RESOURCE)) return true; if (uid_eq(sbi->s_resuid, current_fsuid())) @@ -1684,7 +1689,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi)) + if (!__allow_reserved_blocks(sbi, inode)) avail_user_block_count -= sbi->root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { @@ -1885,7 +1890,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, valid_block_count = sbi->total_valid_block_count + sbi->current_reserved_blocks + 1; - if (!__allow_reserved_blocks(sbi)) + if (!__allow_reserved_blocks(sbi, inode)) valid_block_count += sbi->root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { From d5efd57e013bfcc82e2338b799df0877fc8db236 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 17 Jan 2018 12:11:31 +0800 Subject: [PATCH 0579/1212] f2fs: avoid hungtask when GC encrypted block if io_bits is set When io_bits is set, GCing encrypted block may hit the following hungtask. Since io_bits requires aligned block address, f2fs_submit_page_write may return -EAGAIN if new_blkaddr does not satisfy io_bits alignment. As a result, the encrypted page will never be written back. 
This patch makes move_data_block aware the EAGAIN error and cancel the writeback. [ 246.751371] INFO: task kworker/u4:4:797 blocked for more than 90 seconds. [ 246.752423] Not tainted 4.15.0-rc4+ #11 [ 246.754176] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 246.755336] kworker/u4:4 D25448 797 2 0x80000000 [ 246.755597] Workqueue: writeback wb_workfn (flush-7:0) [ 246.755616] Call Trace: [ 246.755695] ? __schedule+0x322/0xa90 [ 246.755761] ? blk_init_request_from_bio+0x120/0x120 [ 246.755773] ? pci_mmcfg_check_reserved+0xb0/0xb0 [ 246.755801] ? __radix_tree_create+0x19e/0x200 [ 246.755813] ? delete_node+0x136/0x370 [ 246.755838] schedule+0x43/0xc0 [ 246.755904] io_schedule+0x17/0x40 [ 246.755939] wait_on_page_bit_common+0x17b/0x240 [ 246.755950] ? wake_page_function+0xa0/0xa0 [ 246.755961] ? add_to_page_cache_lru+0x160/0x160 [ 246.755972] ? page_cache_tree_insert+0x170/0x170 [ 246.755983] ? __lru_cache_add+0x96/0xb0 [ 246.756086] __filemap_fdatawait_range+0x14f/0x1c0 [ 246.756097] ? wait_on_page_bit_common+0x240/0x240 [ 246.756120] ? __wake_up_locked_key_bookmark+0x20/0x20 [ 246.756167] ? wait_on_all_pages_writeback+0xc9/0x100 [ 246.756179] ? __remove_ino_entry+0x120/0x120 [ 246.756192] ? wait_woken+0x100/0x100 [ 246.756204] filemap_fdatawait_range+0x9/0x20 [ 246.756216] write_checkpoint+0x18a1/0x1f00 [ 246.756254] ? blk_get_request+0x10/0x10 [ 246.756265] ? cpumask_next_and+0x43/0x60 [ 246.756279] ? f2fs_sync_inode_meta+0x160/0x160 [ 246.756289] ? remove_element.isra.4+0xa0/0xa0 [ 246.756300] ? __put_compound_page+0x40/0x40 [ 246.756310] ? f2fs_sync_fs+0xec/0x1c0 [ 246.756320] ? f2fs_sync_fs+0x120/0x1c0 [ 246.756329] f2fs_sync_fs+0x120/0x1c0 [ 246.756357] ? trace_event_raw_event_f2fs__page+0x260/0x260 [ 246.756393] ? ata_build_rw_tf+0x173/0x410 [ 246.756397] f2fs_balance_fs_bg+0x198/0x390 [ 246.756405] ? drop_inmem_page+0x230/0x230 [ 246.756415] ? ahci_qc_prep+0x1bb/0x2e0 [ 246.756418] ? ahci_qc_issue+0x1df/0x290 [ 246.756422] ? 
__accumulate_pelt_segments+0x42/0xd0 [ 246.756426] ? f2fs_write_node_pages+0xd1/0x380 [ 246.756429] f2fs_write_node_pages+0xd1/0x380 [ 246.756437] ? sync_node_pages+0x8f0/0x8f0 [ 246.756440] ? update_curr+0x53/0x220 [ 246.756444] ? __accumulate_pelt_segments+0xa2/0xd0 [ 246.756448] ? __update_load_avg_se.isra.39+0x349/0x360 [ 246.756452] ? do_writepages+0x2a/0xa0 [ 246.756456] do_writepages+0x2a/0xa0 [ 246.756460] __writeback_single_inode+0x70/0x490 [ 246.756463] ? check_preempt_wakeup+0x199/0x310 [ 246.756467] writeback_sb_inodes+0x2a2/0x660 [ 246.756471] ? is_empty_dir_inode+0x40/0x40 [ 246.756474] ? __writeback_single_inode+0x490/0x490 [ 246.756477] ? string+0xbf/0xf0 [ 246.756480] ? down_read_trylock+0x35/0x60 [ 246.756484] __writeback_inodes_wb+0x9f/0xf0 [ 246.756488] wb_writeback+0x41d/0x4b0 [ 246.756492] ? writeback_inodes_wb.constprop.55+0x150/0x150 [ 246.756498] ? set_worker_desc+0xf7/0x130 [ 246.756502] ? current_is_workqueue_rescuer+0x60/0x60 [ 246.756511] ? _find_next_bit+0x2c/0xa0 [ 246.756514] ? wb_workfn+0x400/0x5d0 [ 246.756518] wb_workfn+0x400/0x5d0 [ 246.756521] ? finish_task_switch+0xdf/0x2a0 [ 246.756525] ? inode_wait_for_writeback+0x30/0x30 [ 246.756529] process_one_work+0x3a7/0x6f0 [ 246.756533] worker_thread+0x82/0x750 [ 246.756537] kthread+0x16f/0x1c0 [ 246.756541] ? trace_event_raw_event_workqueue_work+0x110/0x110 [ 246.756544] ? 
kthread_create_worker_on_cpu+0xb0/0xb0 [ 246.756548] ret_from_fork+0x1f/0x30 Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d98b869456ce..d0de3429c26c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -691,7 +691,12 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_write(&fio); + err = f2fs_submit_page_write(&fio); + if (err) { + if (PageWriteback(fio.encrypted_page)) + end_page_writeback(fio.encrypted_page); + goto put_page_out; + } f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); From c1b74c96709223b65a03732cfc9963483e3d105f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 16:31:35 +0800 Subject: [PATCH 0580/1212] f2fs: clean up error path of fill_super This patch cleans up the error path of fill_super to avoid unneeded release steps. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d89834b79646..aaeba346e9d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2612,14 +2612,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = init_percpu_info(sbi); if (err) - goto free_options; + goto free_bio_info; if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); if (!sbi->write_io_dummy) { err = -ENOMEM; - goto free_options; + goto free_percpu; } } @@ -2851,10 +2851,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) iput(sbi->meta_inode); free_io_dummy: mempool_destroy(sbi->write_io_dummy); -free_options: +free_percpu: + destroy_percpu_info(sbi); +free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); - destroy_percpu_info(sbi); +free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); From eeb0118b8340767cb5be7ccc0abeaba11416b317 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 16:31:36 +0800 Subject: [PATCH 0581/1212] f2fs: kill F2FS_INLINE_XATTR_ADDRS for cleanup Use get_inline_xattr_addrs directly instead of F2FS_INLINE_XATTR_ADDRS. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6401cf431026..7bd2c9e7e873 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1200,7 +1200,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS(inode)); + get_inline_xattr_addrs(inode)); phys += offset; len = inline_xattr_size(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6dc15d0ea3c8..23944b3417dd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -478,10 +478,9 @@ struct f2fs_flush_device { #define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); -#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ (CUR_ADDRS_PER_INODE(inode) - \ - F2FS_INLINE_XATTR_ADDRS(inode) - \ + get_inline_xattr_addrs(inode) - \ DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ @@ -2388,7 +2387,7 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); + return CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode); } static inline void *inline_xattr_addr(struct inode *inode, struct page *page) @@ -2396,7 +2395,7 @@ static inline void *inline_xattr_addr(struct inode *inode, struct page *page) struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS(inode)]); + get_inline_xattr_addrs(inode)]); } static inline int inline_xattr_size(struct inode *inode) From f31d52811c1f654de5f8a01c5e277b56e737e9c3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 
2018 16:31:37 +0800 Subject: [PATCH 0582/1212] f2fs: fix to update last_disk_size correctly This patch fixes to update last_disk_size only when writing out page successfully. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd2c9e7e873..5dc6e461aa31 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1715,10 +1715,14 @@ static int __write_data_page(struct page *page, bool *submitted, } } - down_write(&F2FS_I(inode)->i_sem); - if (F2FS_I(inode)->last_disk_size < psize) - F2FS_I(inode)->last_disk_size = psize; - up_write(&F2FS_I(inode)->i_sem); + if (err) { + file_set_keep_isize(inode); + } else { + down_write(&F2FS_I(inode)->i_sem); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); + } done: if (err && err != -ENOENT) From 700b53f21ee8c4feb0238b10ea23b76a8f1e7231 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 16:31:38 +0800 Subject: [PATCH 0583/1212] f2fs: split need_inplace_update This patch splits need_inplace_update to two functions: a. should_update_inplace() includes all conditions that we must use IPU. b. should_update_outplace() includes all conditions that we must use OPU. So that, in f2fs_ioc_set_pin_file() and f2fs_defragment_range(), we can use corresponding function to check whether we can trigger OPU/IPU or not. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 75 ++++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 7 ++++- fs/f2fs/segment.h | 41 -------------------------- 4 files changed, 75 insertions(+), 50 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5dc6e461aa31..2003ebdc9b52 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1517,20 +1517,79 @@ static int encrypt_one_page(struct f2fs_io_info *fio) return PTR_ERR(fio->encrypted_page); } +static inline bool check_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; + + if (policy & (0x1 << F2FS_IPU_FORCE)) + return true; + if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(inode, FI_NEED_IPU)) + return true; + + return false; +} + +bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +{ + if (f2fs_is_pinned_file(inode)) + return true; + + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + + return check_inplace_update_policy(inode, fio); +} + +bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (test_opt(sbi, LFS)) + return true; + if (S_ISDIR(inode->i_mode)) + return true; + if (f2fs_is_atomic_file(inode)) + return true; + if 
(fio) { + if (is_cold_data(fio->page)) + return true; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return true; + } + return false; +} + static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (f2fs_is_pinned_file(inode)) - return true; - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (is_cold_data(fio->page)) - return false; - if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + if (should_update_outplace(inode, fio)) return false; - return need_inplace_update_policy(inode, fio); + return should_update_inplace(inode, fio); } static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 23944b3417dd..29dad838bf42 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2928,6 +2928,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); void f2fs_set_page_dirty_nobuffers(struct page *page); int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 02c20d55cf90..2eb9710bf263 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2058,7 +2058,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update_policy(inode, NULL)) + if (should_update_inplace(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2498,6 +2498,11 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); + if (should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); 
F2FS_I(inode)->i_gc_failures = 1; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5c4d432ebf1d..e123dd30f2e4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -580,47 +580,6 @@ enum { F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update_policy(struct inode *inode, - struct f2fs_io_info *fio) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int policy = SM_I(sbi)->ipu_policy; - - if (test_opt(sbi, LFS)) - return false; - - /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) - return true; - - if (policy & (0x1 << F2FS_IPU_FORCE)) - return true; - if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) - return true; - if (policy & (0x1 << F2FS_IPU_UTIL) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - - /* - * IPU for rewrite async pages - */ - if (policy & (0x1 << F2FS_IPU_ASYNC) && - fio && fio->op == REQ_OP_WRITE && - !(fio->op_flags & REQ_SYNC) && - !f2fs_encrypted_inode(inode)) - return true; - - /* this is only set during fdatasync */ - if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(inode, FI_NEED_IPU)) - return true; - - return false; -} - static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { From bb924f777717669e420038c1edd0962ac9205111 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 22:28:52 +0800 Subject: [PATCH 0584/1212] f2fs: hanlde error case in f2fs_ioc_shutdown This patch makes f2fs_ioc_shutdown handling error case correctly. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2eb9710bf263..b108395a6e38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1815,14 +1815,20 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); - if (sb && !IS_ERR(sb)) { + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } + if (sb) { f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ - f2fs_sync_fs(sb, 1); + ret = f2fs_sync_fs(sb, 1); + if (ret) + goto out; f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: From 8069a0e983d999641331e3a7c8cda42de0ae1166 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jan 2018 17:23:29 +0800 Subject: [PATCH 0585/1212] f2fs: stop gc/discard thread after fs shutdown Once the filesystem shuts down, daemons like the gc/discard threads should be aware of it and exit; in addition, drop all cached pending discard commands and turn off real-time discard mode. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 7 +++++++ fs/f2fs/segment.c | 5 +++++ 3 files changed, 13 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 29dad838bf42..26f8aefe5f5f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2820,6 +2820,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void init_discard_policy(struct discard_policy *dpolicy, int discard_type, unsigned int granularity); +void drop_discard_cmd(struct f2fs_sb_info *sbi); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b108395a6e38..86507b51b7d3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1842,6 +1842,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = -EINVAL; goto out; } + + stop_gc_thread(sbi); + stop_discard_thread(sbi); + + drop_discard_cmd(sbi); + clear_opt(sbi, DISCARD); + f2fs_update_time(sbi, REQ_TIME); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7638ebb1c343..cfc19d8d4625 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1343,6 +1343,11 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } +void drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + __drop_discard_cmd(sbi); +} + static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { From 70b3a923daff38468c03ad2b5a4b6efd65e5afa3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jan 2018 17:29:10 +0800 Subject: [PATCH 0586/1212] f2fs: drop page cache after fs shutdown Don't remain dirtied page cache in f2fs after shutdown, it can mitigate memory pressure of whole system, in order to keep other modules working properly. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +++++-- fs/f2fs/data.c | 12 ++++++------ fs/f2fs/node.c | 19 ++++++++++--------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8e629434cd05..91c18dd62974 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -238,12 +238,15 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); + if (unlikely(f2fs_cp_error(sbi))) { + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; + } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2003ebdc9b52..c80f138b0f33 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1715,6 +1715,12 @@ static int __write_data_page(struct page *page, bool *submitted, trace_f2fs_writepage(page, DATA); + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; @@ -1739,12 +1745,6 @@ static int __write_data_page(struct page *page, bool *submitted, available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - mapping_set_error(page->mapping, -EIO); - goto out; - } - /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { fio.need_lock = LOCK_DONE; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 676b0e3f5ef3..ad5df8bc51ad 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1344,10 +1344,14 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, 
trace_f2fs_writepage(page, NODE); + if (unlikely(f2fs_cp_error(sbi))) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; /* get old block addr of this node page */ nid = nid_of_node(page); @@ -1592,12 +1596,6 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, struct page *page = pvec.pages[i]; bool submitted = false; - if (unlikely(f2fs_cp_error(sbi))) { - pagevec_release(&pvec); - ret = -EIO; - goto out; - } - /* * flushing sequence with step: * 0. indirect nodes @@ -1667,9 +1665,12 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, step++; goto next_step; } -out: + if (nwritten) f2fs_submit_merged_write(sbi, NODE); + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; return ret; } From 64aa9569a1bffeafac71f48930e05f87a909d1cb Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sat, 20 Jan 2018 15:46:33 +0800 Subject: [PATCH 0587/1212] f2fs: correct removexattr behavior for null valued extended attribute __vfs_removexattr() transfers "NULL" value to the setxattr handler of the f2fs filesystem in order to remove the extended attribute. But, __f2fs_setxattr() just ignores the removal request when the value of the extended attribute is already NULL. We have to remove the extended attribute itself even if the value of that is already NULL. We can reproduce this bug with the steps below: 1. touch file 2. setfattr -n "user.foo" file 3. setfattr -x "user.foo" file 4. 
getfattr -d file > user.foo Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Tested-by: Hobin Woo Tested-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 353fbff85bab..116be979b897 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -640,7 +640,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, goto exit; } - if (f2fs_xattr_value_same(here, value, size)) + if (value && f2fs_xattr_value_same(here, value, size)) goto exit; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; From e56500860be0787a5b78380463ec0fd027460de3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jan 2018 20:01:40 -0800 Subject: [PATCH 0588/1212] f2fs: recover some i_inline flags This fixes lost i_inline flags during roll-forward. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7d63faf51e52..b6d1ec620a8c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -195,6 +195,20 @@ static int recover_dentry(struct inode *inode, struct page *ipage, return err; } +static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_PIN_FILE) + set_inode_flag(inode, FI_PIN_FILE); + else + clear_inode_flag(inode, FI_PIN_FILE); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(inode, FI_DATA_EXIST); + else + clear_inode_flag(inode, FI_DATA_EXIST); + if (!(ri->i_inline & F2FS_INLINE_DOTS)) + clear_inode_flag(inode, FI_INLINE_DOTS); +} + static void recover_inode(struct inode *inode, struct page *page) { struct f2fs_inode *raw = F2FS_INODE(page); @@ -211,13 +225,16 @@ static void recover_inode(struct inode *inode, struct page *page) F2FS_I(inode)->i_advise = raw->i_advise; + recover_inline_flags(inode, raw); + if 
(file_enc_name(inode)) name = ""; else name = F2FS_INODE(page)->i_name; - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(page), name); + f2fs_msg(inode->i_sb, KERN_NOTICE, + "recover_inode: ino = %x, name = %s, inline = %x", + ino_of_node(page), name, raw->i_inline); } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, From 35b11839a1ae84d02338b2c96952bffa1c908df8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jan 2018 13:42:33 -0800 Subject: [PATCH 0589/1212] f2fs: allow to recover node blocks given updated checkpoint If fsck.f2fs changes crc, we have no way to recover some inode blocks by roll- forward recovery. Let's relax the condition to recover them. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/node.h | 4 ++++ include/linux/f2fs_fs.h | 1 + 3 files changed, 6 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 91c18dd62974..3c343e922f6e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1161,6 +1161,7 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); spin_unlock_irqrestore(&sbi->cp_lock, flags); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0ee3e5ff49a3..081ef0d672bf 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -305,6 +305,10 @@ static inline bool is_recoverable_dnode(struct page *page) struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); __u64 cp_ver = cur_cp_version(ckpt); + /* Don't care crc part, if fsck.f2fs sets it. 
*/ + if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) + return (cp_ver << 32) == (cpver_of_node(page) << 32); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 9bba23187c04..9e0cb7b63883 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -117,6 +117,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_NOCRC_RECOVERY_FLAG 0x00000200 #define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 From b007190234d624dad977a124ec8d520f4c874b6d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 18:57:25 +0800 Subject: [PATCH 0590/1212] f2fs: use GFP_F2FS_ZERO for cleanup Clean up codes with GFP_F2FS_ZERO, no logic changes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ad5df8bc51ad..c294d0feea08 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -143,11 +143,9 @@ static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) struct nat_entry *new; if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, - GFP_NOFS | __GFP_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); else - new = kmem_cache_alloc(nat_entry_slab, - GFP_NOFS | __GFP_ZERO); + new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); From fa043fae90300e9b49218e204409a5066121b0a7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 18:57:26 +0800 Subject: [PATCH 0591/1212] f2fs: clean up duplicated assignment in init_discard_policy Remove duplicated codes of assignment for .max_requests and .io_aware_gran. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cfc19d8d4625..31c69c6660e7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1795,25 +1795,20 @@ void init_discard_policy(struct discard_policy *dpolicy, dpolicy->sync = true; dpolicy->granularity = granularity; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = true; } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = true; } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = false; } } From 1062a0c018296c6719e49b25bed206d414c18898 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 18:57:27 +0800 Subject: [PATCH 0592/1212] f2fs: stop issuing discard if fs is readonly If filesystem is readonly, stop to issue discard in daemon. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 31c69c6660e7..6662c6caf477 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1486,6 +1486,8 @@ static int issue_discard_thread(void *data) msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; + if (f2fs_readonly(sbi->sb)) + continue; if (kthread_should_stop()) return 0; From 9fb0de175172c63132cc84b630e8c50834269e1b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 25 Jan 2018 17:27:11 +0800 Subject: [PATCH 0593/1212] f2fs: rebuild sit page from sit info in mem This patch rebuild sit page from sit info in mem instead of issue a read io. I test this method and the result is as below: Pre: mmc_perf_test-12061 [001] ...1 976.819992: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [001] ...1 976.856446: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 998.976946: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 999.023269: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1022.060772: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1022.111034: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [002] ...1 1070.127643: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1070.187352: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1095.942124: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1095.995975: f2fs_write_checkpoint: dev = 
(259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1122.535091: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1122.586521: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [001] ...1 1147.897487: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [001] ...1 1147.959438: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1177.926951: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [002] ...1 1177.976823: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [002] ...1 1204.176087: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [002] ...1 1204.239046: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit Some sit flush consume more than 50ms. 
Now: mmc_perf_test-2187 [007] ...1 196.840684: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [007] ...1 196.841258: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [007] ...1 219.430582: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [007] ...1 219.431144: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [002] ...1 243.638678: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [000] ...1 243.638980: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [002] ...1 265.392180: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [002] ...1 265.392245: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [000] ...1 290.309051: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [000] ...1 290.309116: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [003] ...1 317.144209: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [003] ...1 317.145913: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [005] ...1 343.224954: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [005] ...1 343.225574: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [000] ...1 370.239846: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [000] ...1 370.241138: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end 
flush sit mmc_perf_test-2187 [001] ...1 397.029043: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [001] ...1 397.030750: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [003] ...1 425.386377: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [003] ...1 425.387735: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit Most sit flush consume no more than 1ms. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 19 +++++-------------- fs/f2fs/segment.h | 29 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6662c6caf477..bf98f6f34b7e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3191,28 +3191,19 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - struct page *src_page, *dst_page; + struct page *page; pgoff_t src_off, dst_off; - void *src_addr, *dst_addr; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - /* get current sit block page without lock */ - src_page = get_meta_page(sbi, src_off); - dst_page = grab_meta_page(sbi, dst_off); - f2fs_bug_on(sbi, PageDirty(src_page)); - - src_addr = page_address(src_page); - dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_SIZE); - - set_page_dirty(dst_page); - f2fs_put_page(src_page, 1); + page = grab_meta_page(sbi, dst_off); + seg_info_to_sit_page(sbi, page, start); + set_page_dirty(page); set_to_next_sit(sit_i, start); - return dst_page; + return page; } static struct sit_entry_set *grab_sit_entry_set(void) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e123dd30f2e4..5d6d3e72be31 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -348,16 +348,41 @@ static 
inline void seg_info_from_raw_sit(struct seg_entry *se, se->mtime = le64_to_cpu(rs->mtime); } -static inline void seg_info_to_raw_sit(struct seg_entry *se, +static inline void __seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks; rs->vblocks = cpu_to_le16(raw_vblocks); memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + rs->mtime = cpu_to_le64(se->mtime); +} + +static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, + struct page *page, unsigned int start) +{ + struct f2fs_sit_block *raw_sit; + struct seg_entry *se; + struct f2fs_sit_entry *rs; + unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + int i; + + raw_sit = (struct f2fs_sit_block *)page_address(page); + for (i = 0; i < end - start; i++) { + rs = &raw_sit->entries[i]; + se = get_seg_entry(sbi, start + i); + __seg_info_to_raw_sit(se, rs); + } +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + __seg_info_to_raw_sit(se, rs); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); se->ckpt_valid_blocks = se->valid_blocks; - rs->mtime = cpu_to_le64(se->mtime); } static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, From 5f9b3abb911fa2f51f5690f4376cf919c2069662 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 14:54:42 +0800 Subject: [PATCH 0594/1212] f2fs: support inode creation time This patch adds a creation time field to the inode layout to support showing kstat.btime in ->statx. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +++++++ fs/f2fs/file.c | 31 +++++++++++++++++++++++++++++++ fs/f2fs/inode.c | 15 +++++++++++++++ fs/f2fs/namei.c | 3 ++- fs/f2fs/sysfs.c | 7 +++++++ include/linux/f2fs_fs.h | 4 +++- 6 files changed, 65 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 26f8aefe5f5f..0eeeeba0246d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -125,6 +125,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_FEATURE_QUOTA_INO 0x0080 +#define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -699,6 +700,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ + struct timespec i_crtime; /* inode creation time */ }; static inline void get_extent_info(struct extent_info *ext, @@ -3299,6 +3301,11 @@ static inline int f2fs_sb_has_quota_ino(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); } +static inline int f2fs_sb_has_inode_crtime(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 86507b51b7d3..65cda5bc61b7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -673,6 +673,37 @@ int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = d_inode(dentry); +#if 0 + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri; + unsigned int flags; + + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_inode_crtime(inode->i_sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + 
stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = fi->i_crtime.tv_sec; + stat->btime.tv_nsec = fi->i_crtime.tv_nsec; + } + + flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); + if (flags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (f2fs_encrypted_inode(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + if (flags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); +#endif generic_fillattr(inode, stat); /* we need to show initial sectors used for inline_data/dentries */ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1dc77a40d0ad..89c838bfb067 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -278,6 +278,12 @@ static int do_read_inode(struct inode *inode) i_projid = F2FS_DEF_PROJID; fi->i_projid = make_kprojid(&init_user_ns, i_projid); + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); + fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); + } + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -421,6 +427,15 @@ void update_inode(struct inode *inode, struct page *node_page) F2FS_I(inode)->i_projid); ri->i_projid = cpu_to_le32(i_projid); } + + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_crtime)) { + ri->i_crtime = + cpu_to_le64(F2FS_I(inode)->i_crtime.tv_sec); + ri->i_crtime_nsec = + cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); + } } __set_inode_rdev(inode, ri); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7573779a8e7c..da7f709e3926 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,7 +50,8 @@ 
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + F2FS_I(inode)->i_crtime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 41887e6ec1b3..d978c7b6ea04 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -113,6 +113,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_quota_ino(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); + if (f2fs_sb_has_inode_crtime(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_crtime"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -232,6 +235,7 @@ enum feat_id { FEAT_INODE_CHECKSUM, FEAT_FLEXIBLE_INLINE_XATTR, FEAT_QUOTA_INO, + FEAT_INODE_CRTIME, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -246,6 +250,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_INODE_CHECKSUM: case FEAT_FLEXIBLE_INLINE_XATTR: case FEAT_QUOTA_INO: + case FEAT_INODE_CRTIME: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -323,6 +328,7 @@ F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); +F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -376,6 +382,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(inode_checksum), ATTR_LIST(flexible_inline_xattr), ATTR_LIST(quota_ino), + ATTR_LIST(inode_crtime), NULL, }; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 9e0cb7b63883..c82ae65b5330 100644 --- 
a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -253,8 +253,10 @@ struct f2fs_inode { __le16 i_inline_xattr_size; /* inline xattr size, unit: 4 bytes */ __le32 i_projid; /* project id */ __le32 i_inode_checksum;/* inode meta checksum */ + __le64 i_crtime; /* creation time */ + __le32 i_crtime_nsec; /* creation time in nano scale */ __le32 i_extra_end[0]; /* for attribute size calculation */ - }; + } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), From 39ed8376d611d2a211079be6a2ac08715f5a58c4 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 28 Feb 2018 20:31:52 +0800 Subject: [PATCH 0595/1212] f2fs: don't put dentry page in pagecache into highmem Previously, dentry pages used highmem, which causes a panic on platforms using highmem (such as arm), since the address space of dentry pages from highmem directly goes into the decryption path via the function fscrypt_fname_disk_to_usr. But sg_init_one assumes the address is not from highmem, and then causes a panic since it doesn't call kmap_high but kunmap_high is triggered at the end. To fix this problem in a simple way, this patch avoids putting dentry pages in the pagecache into highmem. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu [Jaegeuk Kim: fix coding style] Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 23 +++++------------------ fs/f2fs/f2fs.h | 6 ------ fs/f2fs/inline.c | 3 +-- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 32 ++++++++------------------------ fs/f2fs/recovery.c | 11 +++++------ include/linux/f2fs_fs.h | 1 - 7 files changed, 20 insertions(+), 58 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bde445e4e690..560b707050ca 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,14 +94,12 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; - dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; - else - kunmap(dentry_page); return de; } @@ -287,7 +285,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, *page); f2fs_put_page(*page, 0); } @@ -302,7 +299,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); - f2fs_dentry_kunmap(dir, page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); @@ -350,13 +346,11 @@ static int make_empty_dir(struct inode *inode, if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); - kunmap_atomic(dentry_blk); - set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; @@ -547,13 +541,12 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, if 
(IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = kmap(dentry_page); + dentry_blk = page_address(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } @@ -588,7 +581,6 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, if (inode) up_write(&F2FS_I(inode)->i_sem); - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -642,7 +634,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, F2FS_I(dir)->task = NULL; } if (de) { - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); err = -EEXIST; } else if (IS_ERR(page)) { @@ -730,7 +721,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); - kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); @@ -775,7 +765,7 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = page_address(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -783,7 +773,6 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); @@ -901,19 +890,17 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } } - dentry_blk = kmap(dentry_page); + dentry_blk = page_address(dentry_page); make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; } - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } out_free: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0eeeeba0246d..e54ffadb692c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2463,12 
+2463,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DENTRY); } -static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) -{ - if (!f2fs_has_inline_dentry(dir)) - kunmap(page); -} - static inline int is_file(struct inode *inode, int type) { return F2FS_I(inode)->i_advise & type; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 0fa5ca0907ba..12f6c6471c56 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -369,7 +369,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, f2fs_wait_on_page_writeback(page, DATA, true); zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); - dentry_blk = kmap_atomic(page); + dentry_blk = page_address(page); make_dentry_ptr_inline(dir, &src, inline_dentry); make_dentry_ptr_block(dir, &dst, dentry_blk); @@ -386,7 +386,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); - kunmap_atomic(dentry_blk); if (!PageUptodate(page)) SetPageUptodate(page); set_page_dirty(page); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 89c838bfb067..10be247ca421 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -328,7 +328,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); } else if (S_ISLNK(inode->i_mode)) { if (f2fs_encrypted_inode(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index da7f709e3926..6bb1adb84324 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -317,7 +317,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) de = f2fs_find_entry(dir, &dot, &page); if (de) { 
- f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); } else if (IS_ERR(page)) { err = PTR_ERR(page); @@ -329,14 +328,12 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) } de = f2fs_find_entry(dir, &dotdot, &page); - if (de) { - f2fs_dentry_kunmap(dir, page); + if (de) f2fs_put_page(page, 0); - } else if (IS_ERR(page)) { + else if (IS_ERR(page)) err = PTR_ERR(page); - } else { + else err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); - } out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -377,7 +374,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } ino = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); @@ -452,7 +448,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) err = acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); goto fail; } @@ -610,7 +605,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); @@ -924,13 +919,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (old_dir_entry) { - if (old_dir != new_dir && !whiteout) { + if (old_dir != new_dir && !whiteout) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - } else { - f2fs_dentry_kunmap(old_inode, old_dir_page); + else f2fs_put_page(old_dir_page, 0); - } f2fs_i_links_write(old_dir, false); } add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); @@ -943,20 +936,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - if (new_page) { - f2fs_dentry_kunmap(new_dir, new_page); + if (new_page) 
f2fs_put_page(new_page, 0); - } out_whiteout: if (whiteout) iput(whiteout); out_dir: - if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); + if (old_dir_entry) f2fs_put_page(old_dir_page, 0); - } out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; @@ -1098,19 +1086,15 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; out_new_dir: if (new_dir_entry) { - f2fs_dentry_kunmap(new_inode, new_dir_page); f2fs_put_page(new_dir_page, 0); } out_old_dir: if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } out_new: - f2fs_dentry_kunmap(new_dir, new_page); f2fs_put_page(new_page, 0); out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b6d1ec620a8c..210de28c9cd2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -144,7 +144,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, retry: de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) - goto out_unmap_put; + goto out_put; if (de) { einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); @@ -153,19 +153,19 @@ static int recover_dentry(struct inode *inode, struct page *ipage, err = PTR_ERR(einode); if (err == -ENOENT) err = -EEXIST; - goto out_unmap_put; + goto out_put; } err = dquot_initialize(einode); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } f2fs_delete_entry(de, page, dir, einode); iput(einode); @@ -180,8 +180,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, goto retry; goto out; -out_unmap_put: - f2fs_dentry_kunmap(dir, page); +out_put: f2fs_put_page(page, 0); out: if (file_enc_name(inode)) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h 
index c82ae65b5330..073365c9808a 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -46,7 +46,6 @@ /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) -#define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM) /* * For further optimization on multi-head logs, on-disk layout supports maximum From 3a2c7917782292956a32d4e1df8dc3dbecc01b25 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:52 -0800 Subject: [PATCH 0596/1212] fscrypt: move fscrypt_has_encryption_key() to supp/notsupp headers fscrypt_has_encryption_key() is already split into two versions depending on whether the filesystem is being built with encryption support or not. Move them into the appropriate headers. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 10 ---------- include/linux/fscrypt_notsupp.h | 5 +++++ include/linux/fscrypt_supp.h | 5 +++++ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 8641e56b8f8a..1e2343b46262 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -125,11 +125,6 @@ static inline struct page *fscrypt_control_page(struct page *page) return ((struct fscrypt_ctx *)page_private(page))->w.control_page; } -static inline bool fscrypt_has_encryption_key(const struct inode *inode) -{ - return (inode->i_crypt_info != NULL); -} - #include #else /* !__FS_HAS_ENCRYPTION */ @@ -140,11 +135,6 @@ static inline struct page *fscrypt_control_page(struct page *page) return ERR_PTR(-EINVAL); } -static inline bool fscrypt_has_encryption_key(const struct inode *inode) -{ - return 0; -} - #include #endif /* __FS_HAS_ENCRYPTION */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index c4c6bf2c390e..f8685c25b7b3 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -13,6 +13,11 @@ #ifndef 
_LINUX_FSCRYPT_NOTSUPP_H #define _LINUX_FSCRYPT_NOTSUPP_H +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return false; +} + /* crypto.c */ static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 2db5e9706f60..1fb73a6892b1 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,11 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return (inode->i_crypt_info != NULL); +} + /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); From dfe0b3b1b67f6489ea857ef75135e27eb16638d4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:53 -0800 Subject: [PATCH 0597/1212] fscrypt: move fscrypt_control_page() to supp/notsupp headers fscrypt_control_page() is already split into two versions depending on whether the filesystem is being built with encryption support or not. Move them into the appropriate headers. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 18 ++---------------- include/linux/fscrypt_notsupp.h | 5 +++++ include/linux/fscrypt_supp.h | 6 ++++++ 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 1e2343b46262..0961315a5fff 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -119,24 +119,10 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) } #if __FS_HAS_ENCRYPTION - -static inline struct page *fscrypt_control_page(struct page *page) -{ - return ((struct fscrypt_ctx *)page_private(page))->w.control_page; -} - #include - -#else /* !__FS_HAS_ENCRYPTION */ - -static inline struct page *fscrypt_control_page(struct page *page) -{ - WARN_ON_ONCE(1); - return ERR_PTR(-EINVAL); -} - +#else #include -#endif /* __FS_HAS_ENCRYPTION */ +#endif /** * fscrypt_require_key - require an inode's encryption key diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index f8685c25b7b3..3d394a0737ed 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -47,6 +47,11 @@ static inline int fscrypt_decrypt_page(const struct inode *inode, return -EOPNOTSUPP; } +static inline struct page *fscrypt_control_page(struct page *page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} static inline void fscrypt_restore_control_page(struct page *page) { diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 1fb73a6892b1..95ea7265e25b 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -24,6 +24,12 @@ extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, u64, gfp_t); extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int, unsigned int, u64); + +static inline struct page *fscrypt_control_page(struct page *page) +{ + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +} + extern void 
fscrypt_restore_control_page(struct page *); extern const struct dentry_operations fscrypt_d_ops; From 8216a0b51a3b24006d8318e28a7ef318bf142506 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:54 -0800 Subject: [PATCH 0598/1212] fscrypt: move fscrypt_info_cachep declaration to fscrypt_private.h The fscrypt_info kmem_cache is internal to fscrypt; filesystems don't need to access it. So move its declaration into fscrypt_private.h. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 1 + include/linux/fscrypt_supp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index c3ad415cd14f..09e99f5007de 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -82,6 +82,7 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, } /* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); extern struct workqueue_struct *fscrypt_read_workqueue; extern int fscrypt_do_page_crypto(const struct inode *inode, diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 95ea7265e25b..11522c880632 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -16,7 +16,6 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) } /* crypto.c */ -extern struct kmem_cache *fscrypt_info_cachep; extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, From 3f16e09dadfbf319fe4a71b61f6049a83d7c277c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:55 -0800 Subject: [PATCH 0599/1212] fscrypt: move fscrypt_ctx declaration to fscrypt_supp.h Filesystems only ever access 'struct fscrypt_ctx' through fscrypt functions. 
But when a filesystem is built without encryption support, these functions are all stubbed out, so the declaration of fscrypt_ctx is unneeded. Therefore, move it from fscrypt.h to fscrypt_supp.h. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 16 +--------------- include/linux/fscrypt_supp.h | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 0961315a5fff..f627ee378bc3 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -23,23 +23,9 @@ #define FS_CRYPTO_BLOCK_SIZE 16 +struct fscrypt_ctx; struct fscrypt_info; -struct fscrypt_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - u8 flags; /* Flags */ -}; - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. 
diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 11522c880632..40fee89fac9e 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,21 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return (inode->i_crypt_info != NULL); From 7ed178bc8ae9eb13c0a0a155688e2a7187afd2bb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:56 -0800 Subject: [PATCH 0600/1212] fscrypt: split fscrypt_dummy_context_enabled() into supp/notsupp versions fscrypt_dummy_context_enabled() accesses ->s_cop, which now is only set when the filesystem is built with encryption support. This didn't actually matter because no filesystems called it. However, it will start being used soon, so fix it by moving it from fscrypt.h to fscrypt_supp.h and stubbing it out in fscrypt_notsupp.h. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 8 -------- include/linux/fscrypt_notsupp.h | 5 +++++ include/linux/fscrypt_supp.h | 6 ++++++ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index f627ee378bc3..fc43cc303cf2 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -71,14 +71,6 @@ struct fscrypt_operations { unsigned (*max_namelen)(struct inode *); }; -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) -{ - if (inode->i_sb->s_cop->dummy_context && - inode->i_sb->s_cop->dummy_context(inode)) - return true; - return false; -} - static inline bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 3d394a0737ed..151bbc3c61f1 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -18,6 +18,11 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) return false; } +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + return false; +} + /* crypto.c */ static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 40fee89fac9e..90965fa403b1 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -30,6 +30,12 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) return (inode->i_crypt_info != NULL); } +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + return inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode); +} + /* crypto.c */ extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); From efefa434f47e1d907b3a4c31b9c9f1e561fe57d6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 
5 Jan 2018 10:44:57 -0800 Subject: [PATCH 0601/1212] fscrypt: move fscrypt_operations declaration to fscrypt_supp.h Filesystems now only define their fscrypt_operations when they are compiled with encryption support, so move the fscrypt_operations declaration from fscrypt.h to fscrypt_supp.h. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 18 ------------------ include/linux/fscrypt_supp.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index fc43cc303cf2..b29cdfc3486e 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -53,24 +53,6 @@ struct fscrypt_name { #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) -/* - * fscrypt superblock flags - */ -#define FS_CFLG_OWN_PAGES (1U << 1) - -/* - * crypto opertions for filesystems - */ -struct fscrypt_operations { - unsigned int flags; - const char *key_prefix; - int (*get_context)(struct inode *, void *, size_t); - int (*set_context)(struct inode *, const void *, size_t, void *); - bool (*dummy_context)(struct inode *); - bool (*empty_dir)(struct inode *); - unsigned (*max_namelen)(struct inode *); -}; - static inline bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 90965fa403b1..c785f7297f29 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,24 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +/* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto operations for filesystems + */ +struct fscrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*set_context)(struct inode *, const void *, size_t, void *); + bool (*dummy_context)(struct inode *); + bool (*empty_dir)(struct inode *); + 
unsigned (*max_namelen)(struct inode *); +}; + struct fscrypt_ctx { union { struct { From e6fe930580cb0344a4fbd0b15bb8cda5e3986fee Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:58 -0800 Subject: [PATCH 0602/1212] fscrypt: move fscrypt_valid_enc_modes() to fscrypt_private.h The encryption modes are validated by fs/crypto/, not by individual filesystems. Therefore, move fscrypt_valid_enc_modes() from fscrypt.h to fscrypt_private.h. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 14 ++++++++++++++ include/linux/fscrypt.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 09e99f5007de..d5dc791d7228 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -81,6 +81,20 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, bio->bi_rw = op | op_flags; } +static inline bool fscrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; + + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; +} + /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b29cdfc3486e..b03cb23728ea 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -53,20 +53,6 @@ struct fscrypt_name { #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) -static inline bool fscrypt_valid_enc_modes(u32 contents_mode, - u32 filenames_mode) -{ - if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && - filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) - return true; - - if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && - 
filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) - return true; - - return false; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') From d9cadc11bdcf9907041dcaba204384c0ff552b81 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:59 -0800 Subject: [PATCH 0603/1212] fscrypt: move fscrypt_is_dot_dotdot() to fs/crypto/fname.c Only fs/crypto/fname.c cares about treating the "." and ".." filenames specially with regards to encryption, so move fscrypt_is_dot_dotdot() from fscrypt.h to there. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 11 +++++++++++ include/linux/fscrypt.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6eb434363ff2..bce476dc2c65 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,6 +14,17 @@ #include #include "fscrypt_private.h" +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + /** * fname_encrypt() - encrypt a filename * diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b03cb23728ea..f71d6326936e 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -53,17 +53,6 @@ struct fscrypt_name { #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) -static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) -{ - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' 
&& str->name[1] == '.') - return true; - - return false; -} - #if __FS_HAS_ENCRYPTION #include #else From 7f43602f4d104ad482f54e3a9122f3e5f31d60d9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:45:00 -0800 Subject: [PATCH 0604/1212] fscrypt: trim down fscrypt.h includes fscrypt.h included way too many other headers, given that it is included by filesystems both with and without encryption support. Trim down the includes list by moving the needed includes into more appropriate places, and removing the unneeded ones. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 1 + fs/crypto/fname.c | 1 + fs/crypto/keyinfo.c | 1 + include/linux/fscrypt.h | 6 ------ include/linux/fscrypt_supp.h | 3 +++ 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 732a786cce9d..ce654526c0fb 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index bce476dc2c65..f5db8bd500b6 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,6 +12,7 @@ #include #include +#include #include "fscrypt_private.h" static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 444c65ed6db8..7c00331da5df 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "fscrypt_private.h" static struct crypto_shash *essiv_hash_tfm; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index f71d6326936e..486886811915 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -13,13 +13,7 @@ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H -#include #include -#include -#include -#include -#include -#include #define FS_CRYPTO_BLOCK_SIZE 16 diff --git 
a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index c785f7297f29..cdfe1600f53e 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,9 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +#include +#include + /* * fscrypt superblock flags */ From a1cdacb7ae0db3e376c3c874df8c8793448ad1e9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:45:01 -0800 Subject: [PATCH 0605/1212] fscrypt: new helper functions for ->symlink() Currently, filesystems supporting fscrypt need to implement some tricky logic when creating encrypted symlinks, including handling a peculiar on-disk format (struct fscrypt_symlink_data) and correctly calculating the size of the encrypted symlink. Introduce helper functions to make things a bit easier: - fscrypt_prepare_symlink() computes and validates the size the symlink target will require on-disk. - fscrypt_encrypt_symlink() creates the encrypted target if needed. The new helpers actually fix some subtle bugs. First, when checking whether the symlink target was too long, filesystems didn't account for the fact that the NUL padding is meant to be truncated if it would cause the maximum length to be exceeded, as is done for filenames in directories. Consequently users would receive ENAMETOOLONG when creating symlinks close to what is supposed to be the maximum length. For example, with EXT4 with a 4K block size, the maximum symlink target length in an encrypted directory is supposed to be 4093 bytes (in comparison to 4095 in an unencrypted directory), but in FS_POLICY_FLAGS_PAD_32-mode only up to 4064 bytes were accepted. Second, symlink targets of "." and ".." were not being encrypted, even though they should be, as these names are special in *directory entries* but not in symlink targets. Fortunately, we can fix this simply by starting to encrypt them, as old kernels already accept them in encrypted form. 
Third, the output string length the filesystems were providing when doing the actual encryption was incorrect, as it was forgotten to exclude 'sizeof(struct fscrypt_symlink_data)'. Fortunately though, this bug didn't make a difference. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 8 +-- fs/crypto/fscrypt_private.h | 4 ++ fs/crypto/hooks.c | 90 +++++++++++++++++++++++++++++++++ include/linux/fscrypt.h | 64 +++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 16 ++++++ include/linux/fscrypt_supp.h | 6 +++ 6 files changed, 185 insertions(+), 3 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f5db8bd500b6..55ca8d913c94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -33,8 +33,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) * * Return: 0 on success, -errno on failure */ -static int fname_encrypt(struct inode *inode, - const struct qstr *iname, struct fscrypt_str *oname) +int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); @@ -55,9 +55,11 @@ static int fname_encrypt(struct inode *inode, * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. 
*/ + if (WARN_ON(oname->len < iname->len)) + return -ENOBUFS; cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); cryptlen = round_up(cryptlen, padding); - cryptlen = min(cryptlen, lim); + cryptlen = min3(cryptlen, lim, oname->len); memcpy(oname->name, iname->name, iname->len); memset(oname->name + iname->len, 0, cryptlen - iname->len); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d5dc791d7228..0712b0ac974b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -108,6 +108,10 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +/* fname.c */ +extern int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname); + /* keyinfo.c */ extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 9f5fb2eb9cf7..4b83e4af2e41 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -110,3 +110,93 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); + +int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link) +{ + int err; + + /* + * To calculate the size of the encrypted symlink target we need to know + * the amount of NUL padding, which is determined by the flags set in + * the encryption policy which will be inherited from the directory. + * The easiest way to get access to this is to just load the directory's + * fscrypt_info, since we'll need it to create the dir_entry anyway. + * + * Note: in test_dummy_encryption mode, @dir may be unencrypted. + */ + err = fscrypt_get_encryption_info(dir); + if (err) + return err; + if (!fscrypt_has_encryption_key(dir)) + return -ENOKEY; + + /* + * Calculate the size of the encrypted symlink and verify it won't + * exceed max_len. 
Note that for historical reasons, encrypted symlink + * targets are prefixed with the ciphertext length, despite this + * actually being redundant with i_size. This decreases by 2 bytes the + * longest symlink target we can accept. + * + * We could recover 1 byte by not counting a null terminator, but + * counting it (even though it is meaningless for ciphertext) is simpler + * for now since filesystems will assume it is there and subtract it. + */ + if (sizeof(struct fscrypt_symlink_data) + len > max_len) + return -ENAMETOOLONG; + disk_link->len = min_t(unsigned int, + sizeof(struct fscrypt_symlink_data) + + fscrypt_fname_encrypted_size(dir, len), + max_len); + disk_link->name = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink); + +int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, struct fscrypt_str *disk_link) +{ + int err; + struct qstr iname = { .name = target, .len = len }; + struct fscrypt_symlink_data *sd; + unsigned int ciphertext_len; + struct fscrypt_str oname; + + err = fscrypt_require_key(inode); + if (err) + return err; + + if (disk_link->name) { + /* filesystem-provided buffer */ + sd = (struct fscrypt_symlink_data *)disk_link->name; + } else { + sd = kmalloc(disk_link->len, GFP_NOFS); + if (!sd) + return -ENOMEM; + } + ciphertext_len = disk_link->len - sizeof(*sd); + sd->len = cpu_to_le16(ciphertext_len); + + oname.name = sd->encrypted_path; + oname.len = ciphertext_len; + err = fname_encrypt(inode, &iname, &oname); + if (err) { + if (!disk_link->name) + kfree(sd); + return err; + } + BUG_ON(oname.len != ciphertext_len); + + /* + * Null-terminating the ciphertext doesn't make sense, but we still + * count the null terminator in the length, so we might as well + * initialize it just in case the filesystem writes it out. 
+ */ + sd->encrypted_path[ciphertext_len] = '\0'; + + if (!disk_link->name) + disk_link->name = (unsigned char *)sd; + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 486886811915..77a171da8254 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,4 +192,68 @@ static inline int fscrypt_prepare_setattr(struct dentry *dentry, return 0; } +/** + * fscrypt_prepare_symlink - prepare to create a possibly-encrypted symlink + * @dir: directory in which the symlink is being created + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @max_len: space the filesystem has available to store the symlink target + * @disk_link: (out) the on-disk symlink target being prepared + * + * This function computes the size the symlink target will require on-disk, + * stores it in @disk_link->len, and validates it against @max_len. An + * encrypted symlink may be longer than the original. + * + * Additionally, @disk_link->name is set to @target if the symlink will be + * unencrypted, but left NULL if the symlink will be encrypted. For encrypted + * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the + * on-disk target later. (The reason for the two-step process is that some + * filesystems need to know the size of the symlink target before creating the + * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) + * + * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, + * -ENOKEY if the encryption key is missing, or another -errno code if a problem + * occurred while setting up the encryption key. 
+ */ +static inline int fscrypt_prepare_symlink(struct inode *dir, + const char *target, + unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link) +{ + if (IS_ENCRYPTED(dir) || fscrypt_dummy_context_enabled(dir)) + return __fscrypt_prepare_symlink(dir, len, max_len, disk_link); + + disk_link->name = (unsigned char *)target; + disk_link->len = len + 1; + if (disk_link->len > max_len) + return -ENAMETOOLONG; + return 0; +} + +/** + * fscrypt_encrypt_symlink - encrypt the symlink target if needed + * @inode: symlink inode + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @disk_link: (in/out) the on-disk symlink target being prepared + * + * If the symlink target needs to be encrypted, then this function encrypts it + * into @disk_link->name. fscrypt_prepare_symlink() must have been called + * previously to compute @disk_link->len. If the filesystem did not allocate a + * buffer for @disk_link->name after calling fscrypt_prepare_symlink(), then one + * will be kmalloc()'ed and the filesystem will be responsible for freeing it.
+ * + * Return: 0 on success, -errno on failure + */ +static inline int fscrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct fscrypt_str *disk_link) +{ + if (IS_ENCRYPTED(inode)) + return __fscrypt_encrypt_symlink(inode, target, len, disk_link); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 151bbc3c61f1..875c83672318 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -222,4 +222,20 @@ static inline int __fscrypt_prepare_lookup(struct inode *dir, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_symlink(struct inode *dir, + unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + +static inline int __fscrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct fscrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index cdfe1600f53e..6ccaad58d2be 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -204,5 +204,11 @@ extern int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *new_dentry, unsigned int flags); extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry); +extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link); +extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, + struct fscrypt_str *disk_link); #endif /* _LINUX_FSCRYPT_SUPP_H */ From fd457d2c4e0411e56b82f67a3b22c8c589f77038 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:45:02 -0800 Subject: [PATCH 0606/1212] fscrypt: new helper function - fscrypt_get_symlink() Filesystems also have duplicate code to support ->get_link() on encrypted symlinks. 
Factor it out into a new function fscrypt_get_symlink(). It takes in the contents of the encrypted symlink on-disk and provides the target (decrypted or encoded) that should be returned from ->get_link(). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 71 +++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 7 ++++ include/linux/fscrypt_supp.h | 2 + 3 files changed, 80 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 4b83e4af2e41..534cfb212cdb 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -200,3 +200,74 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, return 0; } EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); + +/** + * fscrypt_get_symlink - get the target of an encrypted symlink + * @inode: the symlink inode + * @caddr: the on-disk contents of the symlink + * @max_size: size of @caddr buffer + * + * If the symlink's encryption key is available, we decrypt its target. + * Otherwise, we encode its target for presentation. + * + * This may sleep, so the filesystem must have dropped out of RCU mode already. + * + * Return: the presentable symlink target, which the caller must kfree(), or an + * ERR_PTR() + */ +void *fscrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size) +{ + const struct fscrypt_symlink_data *sd; + struct fscrypt_str cstr, pstr; + int err; + + /* This is for encrypted symlinks only */ + if (WARN_ON(!IS_ENCRYPTED(inode))) + return ERR_PTR(-EINVAL); + + /* + * Try to set up the symlink's encryption key, but we can continue + * regardless of whether the key is available or not. + */ + err = fscrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + + /* + * For historical reasons, encrypted symlink targets are prefixed with + * the ciphertext length, even though this is redundant with i_size.
+ */ + + if (max_size < sizeof(*sd)) + return ERR_PTR(-EUCLEAN); + sd = caddr; + cstr.name = (unsigned char *)sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-EUCLEAN); + + if (cstr.len + sizeof(*sd) - 1 > max_size) + return ERR_PTR(-EUCLEAN); + + err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) + goto err_kfree; + + err = -EUCLEAN; + if (pstr.name[0] == '\0') + goto err_kfree; + + pstr.name[pstr.len] = '\0'; + return pstr.name; + +err_kfree: + kfree(pstr.name); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(fscrypt_get_symlink); diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 875c83672318..1886b255adcb 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -238,4 +238,11 @@ static inline int __fscrypt_encrypt_symlink(struct inode *inode, return -EOPNOTSUPP; } +static inline void *fscrypt_get_symlink(struct inode *inode, + const void *caddr, + unsigned int max_size) +{ + return ERR_PTR(-EOPNOTSUPP); +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 6ccaad58d2be..92e50820fd4f 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -210,5 +210,7 @@ extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); +extern void *fscrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size); #endif /* _LINUX_FSCRYPT_SUPP_H */ From 6b76f58e24bda781a9434989e8d6d4500e91891a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:26:49 -0500 Subject: [PATCH 0607/1212] f2fs: switch to fscrypt ->symlink() helper functions Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- 
fs/f2fs/namei.c | 66 ++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 50 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6bb1adb84324..7438f0d8c9f0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -478,27 +478,16 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); - struct fscrypt_symlink_data *sd = NULL; + struct fscrypt_str disk_link; int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - - if (!fscrypt_has_encryption_key(dir)) - return -ENOKEY; - - disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + - sizeof(struct fscrypt_symlink_data)); - } - - if (disk_link.len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; err = dquot_initialize(dir); if (err) @@ -508,7 +497,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return PTR_ERR(inode); - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; @@ -518,38 +507,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) - goto out; + goto out_handle_failed_inode; f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(inode)) { - struct qstr istr = QSTR_INIT(symname, len); - struct fscrypt_str ostr; - - sd = f2fs_kzalloc(sbi, disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto err_out; - } - - err = fscrypt_get_encryption_info(inode); - if (err) - goto err_out; - - if 
(!fscrypt_has_encryption_key(inode)) { - err = -ENOKEY; - goto err_out; - } - - ostr.name = sd->encrypted_path; - ostr.len = disk_link.len; - err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err) - goto err_out; - - sd->len = cpu_to_le16(ostr.len); - disk_link.name = (char *)sd; - } + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) + goto err_out; err = page_symlink(inode, disk_link.name, disk_link.len); @@ -576,12 +540,14 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_unlink(dir, dentry); } - kfree(sd); - f2fs_balance_fs(sbi, true); - return err; -out: + goto out_free_encrypted_link; + +out_handle_failed_inode: handle_failed_inode(inode); +out_free_encrypted_link: + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); return err; } From 7ac4756a247488122b526156f119833e6356bd72 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:26:49 -0500 Subject: [PATCH 0608/1212] f2fs: switch to fscrypt_get_symlink() Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/namei.c | 66 +++++++++---------------------------------------- 1 file changed, 11 insertions(+), 55 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7438f0d8c9f0..72328a18c086 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1093,65 +1093,21 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { - struct page *cpage = NULL; - char *caddr, *paddr = NULL; - struct fscrypt_str cstr = FSTR_INIT(NULL, 0); - struct fscrypt_str pstr = FSTR_INIT(NULL, 0); - struct fscrypt_symlink_data *sd; struct inode *inode = d_inode(dentry); - u32 max_size = inode->i_sb->s_blocksize; - int res; + struct page *page; + void *target; - res = fscrypt_get_encryption_info(inode); - if (res) - return ERR_PTR(res); + if (!dentry) + return ERR_PTR(-ECHILD); - cpage = 
read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) - return ERR_CAST(cpage); - caddr = page_address(cpage); + page = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(page)) + return ERR_CAST(page); - /* Symlink is encrypted */ - sd = (struct fscrypt_symlink_data *)caddr; - cstr.name = sd->encrypted_path; - cstr.len = le16_to_cpu(sd->len); - - /* this is broken symlink case */ - if (unlikely(cstr.len == 0)) { - res = -ENOENT; - goto errout; - } - - if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { - /* Symlink data on the disk is corrupted */ - res = -EIO; - goto errout; - } - res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); - if (res) - goto errout; - - res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res) - goto errout; - - /* this is broken symlink case */ - if (unlikely(pstr.name[0] == 0)) { - res = -ENOENT; - goto errout; - } - - paddr = pstr.name; - - /* Null-terminate the name */ - paddr[pstr.len] = '\0'; - - put_page(cpage); - return *cookie = paddr; -errout: - fscrypt_fname_free_buffer(&pstr); - put_page(cpage); - return ERR_PTR(res); + target = fscrypt_get_symlink(inode, page_address(page), + inode->i_sb->s_blocksize); + put_page(page); + return *cookie = target; } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { From f9550c24c20e3e4a89c2958c1496588a301d1409 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 0609/1212] fscrypt: remove fscrypt_fname_usr_to_disk() fscrypt_fname_usr_to_disk() sounded very generic but was actually only used to encrypt symlinks. Remove it now that all filesystems have been switched over to fscrypt_encrypt_symlink(). 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 29 ----------------------------- include/linux/fscrypt_notsupp.h | 7 ------- include/linux/fscrypt_supp.h | 2 -- 3 files changed, 38 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 55ca8d913c94..897041751791 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -310,35 +310,6 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); -/** - * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk - * space - * - * The caller must have allocated sufficient memory for the @oname string. - * - * Return: 0 on success, -errno on failure - */ -int fscrypt_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct fscrypt_str *oname) -{ - if (fscrypt_is_dot_dotdot(iname)) { - oname->name[0] = '.'; - oname->name[iname->len - 1] = '.'; - oname->len = iname->len; - return 0; - } - if (inode->i_crypt_info) - return fname_encrypt(inode, iname, oname); - /* - * Without a proper key, a user is not allowed to modify the filenames - * in a directory. 
Consequently, a user space name cannot be mapped to - * a disk-space name - */ - return -ENOKEY; -} -EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); - /** * fscrypt_setup_filename() - prepare to search a possibly encrypted directory * @dir: the directory that will be searched diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 1886b255adcb..db31cf0c80c5 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -158,13 +158,6 @@ static inline int fscrypt_fname_disk_to_usr(struct inode *inode, return -EOPNOTSUPP; } -static inline int fscrypt_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct fscrypt_str *oname) -{ - return -EOPNOTSUPP; -} - static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 92e50820fd4f..ddd79019e3f9 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -112,8 +112,6 @@ extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, extern void fscrypt_fname_free_buffer(struct fscrypt_str *); extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, const struct fscrypt_str *, struct fscrypt_str *); -extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, - struct fscrypt_str *); #define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 From 042ae9f4cfbfb555e6de68579870ac3c43594215 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 0610/1212] fscrypt: move fscrypt_symlink_data to fscrypt_private.h Now that all filesystems have been converted to use the symlink helper functions, they no longer need the declaration of 'struct fscrypt_symlink_data'. Move it from fscrypt.h to fscrypt_private.h. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 9 +++++++++ include/linux/fscrypt.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 0712b0ac974b..e44e8e1419d6 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -49,6 +49,15 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 77a171da8254..9e535af579e8 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -20,15 +20,6 @@ struct fscrypt_ctx; struct fscrypt_info; -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct fscrypt_symlink_data { - __le16 len; - char encrypted_path[1]; -} __packed; - struct fscrypt_str { unsigned char *name; u32 len; From 168a90782888affff92b4a4fe950c9e5afca7179 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 0611/1212] fscrypt: calculate NUL-padding length in one place only Currently, when encrypting a filename (either a real filename or a symlink target) we calculate the amount of NUL-padding twice: once before encryption and once during encryption in fname_encrypt(). It is needed before encryption to allocate the needed buffer size as well as calculate the size the symlink target will take up on-disk before creating the symlink inode. Calculating the size during encryption as well is redundant. 
Remove this redundancy by always calculating the exact size beforehand, and making fname_encrypt() just add as much NUL padding as is needed to fill the output buffer. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 53 ++++++++++++++++++------------------- fs/crypto/fscrypt_private.h | 4 +-- fs/crypto/hooks.c | 7 +---- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 897041751791..3145665c9ca1 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -29,39 +29,29 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fname_encrypt() - encrypt a filename * - * The caller must have allocated sufficient memory for the @oname string. + * The output buffer must be at least as large as the input buffer. + * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ -int fname_encrypt(struct inode *inode, - const struct qstr *iname, struct fscrypt_str *oname) +int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist sg; - int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - unsigned int lim; - unsigned int cryptlen; - - lim = inode->i_sb->s_cop->max_namelen(inode); - if (iname->len <= 0 || iname->len > lim) - return -EIO; /* * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. 
*/ - if (WARN_ON(oname->len < iname->len)) + if (WARN_ON(olen < iname->len)) return -ENOBUFS; - cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); - cryptlen = round_up(cryptlen, padding); - cryptlen = min3(cryptlen, lim, oname->len); - memcpy(oname->name, iname->name, iname->len); - memset(oname->name + iname->len, 0, cryptlen - iname->len); + memcpy(out, iname->name, iname->len); + memset(out + iname->len, 0, olen - iname->len); /* Initialize the IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -76,8 +66,8 @@ int fname_encrypt(struct inode *inode, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - sg_init_one(&sg, oname->name, cryptlen); - skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); + sg_init_one(&sg, out, olen); + skcipher_request_set_crypt(req, &sg, &sg, olen, iv); /* Do the encryption */ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); @@ -88,7 +78,6 @@ int fname_encrypt(struct inode *inode, return res; } - oname->len = cryptlen; return 0; } @@ -353,11 +342,21 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (dir->i_crypt_info) { - ret = fscrypt_fname_alloc_buffer(dir, iname->len, - &fname->crypto_buf); - if (ret) - return ret; - ret = fname_encrypt(dir, iname, &fname->crypto_buf); + unsigned int max_len = dir->i_sb->s_cop->max_namelen(dir); + + if (iname->len > max_len) + return -ENAMETOOLONG; + + fname->crypto_buf.len = + min(fscrypt_fname_encrypted_size(dir, iname->len), + max_len); + fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, + GFP_NOFS); + if (!fname->crypto_buf.name) + return -ENOMEM; + + ret = fname_encrypt(dir, iname, fname->crypto_buf.name, + fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -409,7 +408,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; errout: - fscrypt_fname_free_buffer(&fname->crypto_buf); + 
kfree(fname->crypto_buf.name); return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e44e8e1419d6..eb40d32b8e79 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -118,8 +118,8 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); /* fname.c */ -extern int fname_encrypt(struct inode *inode, - const struct qstr *iname, struct fscrypt_str *oname); +extern int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); /* keyinfo.c */ extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 534cfb212cdb..8b829400f467 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -161,7 +161,6 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, struct qstr iname = { .name = target, .len = len }; struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; - struct fscrypt_str oname; err = fscrypt_require_key(inode); if (err) @@ -178,16 +177,12 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, ciphertext_len = disk_link->len - sizeof(*sd); sd->len = cpu_to_le16(ciphertext_len); - oname.name = sd->encrypted_path; - oname.len = ciphertext_len; - err = fname_encrypt(inode, &iname, &oname); + err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); if (err) { if (!disk_link->name) kfree(sd); return err; } - BUG_ON(oname.len != ciphertext_len); - /* * Null-terminating the ciphertext doesn't make sense, but we still * count the null terminator in the length, so we might as well From 82bec888567bbe1143ae2173b2ef442070ecbe4a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 0612/1212] fscrypt: define fscrypt_fname_alloc_buffer() to be for presented names Previously fscrypt_fname_alloc_buffer() was used to allocate buffers for both presented (decrypted or 
encoded) and encrypted filenames. That was confusing, because it had to allocate the worst-case size for either, e.g. including NUL-padding even when it was meaningless. But now that fscrypt_setup_filename() no longer calls it, it is only used in the ->get_link() and ->readdir() paths, which specifically want a buffer for presented filenames. Therefore, switch the behavior over to allocating the buffer for presented filenames only. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 29 ++++++++++++++--------------- include/linux/fscrypt_notsupp.h | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 3145665c9ca1..aee2c3c36048 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -203,37 +203,36 @@ u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * fscrypt_fname_crypto_alloc_obuff() - + * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames * - * Allocates an output buffer that is sufficient for the crypto operation - * specified by the context and the direction. + * Allocate a buffer that is large enough to hold any decrypted or encoded + * filename (null-terminated), for the given maximum encrypted filename length. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 ilen, struct fscrypt_str *crypto_str) + u32 max_encrypted_len, + struct fscrypt_str *crypto_str) { - u32 olen = fscrypt_fname_encrypted_size(inode, ilen); const u32 max_encoded_len = max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); + u32 max_presented_len; - crypto_str->len = olen; - olen = max(olen, max_encoded_len); + max_presented_len = max(max_encoded_len, max_encrypted_len); - /* - * Allocated buffer can hold one more character to null-terminate the - * string - */ - crypto_str->name = kmalloc(olen + 1, GFP_NOFS); - if (!(crypto_str->name)) + crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); + if (!crypto_str->name) return -ENOMEM; + crypto_str->len = max_presented_len; return 0; } EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * fscrypt_fname_crypto_free_buffer() - + * fscrypt_fname_free_buffer - free the buffer for presented filenames * - * Frees the buffer allocated for crypto operation. + * Free the buffer allocated by fscrypt_fname_alloc_buffer(). 
*/ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index db31cf0c80c5..f5de736cf1c1 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -139,7 +139,7 @@ static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, } static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 ilen, + u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; From 31d3279a4fcaf92099ad5ee613a6cf3db99f7e9b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 0613/1212] fscrypt: fix up fscrypt_fname_encrypted_size() for internal use Filesystems don't need fscrypt_fname_encrypted_size() anymore, so unexport it and move it to fscrypt_private.h. We also never calculate the encrypted size of a filename without having the fscrypt_info present since it is needed to know the amount of NUL-padding which is determined by the encryption policy, and also we will always truncate the NUL-padding to the maximum filename length. Therefore, also make fscrypt_fname_encrypted_size() assume that the fscrypt_info is present, and make it truncate the returned length to the specified max_len. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 29 ++++++++++++++--------------- fs/crypto/fscrypt_private.h | 3 +++ fs/crypto/hooks.c | 10 +++++----- include/linux/fscrypt_notsupp.h | 8 -------- include/linux/fscrypt_supp.h | 1 - 5 files changed, 22 insertions(+), 29 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index aee2c3c36048..b18fa323d1d9 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -190,17 +190,20 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) { - int padding = 32; - struct fscrypt_info *ci = inode->i_crypt_info; + int padding = 4 << (inode->i_crypt_info->ci_flags & + FS_POLICY_FLAGS_PAD_MASK); + u32 encrypted_len; - if (ci) - padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); - return round_up(ilen, padding); + if (orig_len > max_len) + return false; + encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE); + encrypted_len = round_up(encrypted_len, padding); + *encrypted_len_ret = min(encrypted_len, max_len); + return true; } -EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames @@ -341,14 +344,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (dir->i_crypt_info) { - unsigned int max_len = dir->i_sb->s_cop->max_namelen(dir); - - if (iname->len > max_len) + if (!fscrypt_fname_encrypted_size(dir, iname->len, + dir->i_sb->s_cop->max_namelen(dir), + &fname->crypto_buf.len)) return -ENAMETOOLONG; - - fname->crypto_buf.len = - min(fscrypt_fname_encrypted_size(dir, iname->len), - max_len); fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, GFP_NOFS); if (!fname->crypto_buf.name) diff --git 
a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index eb40d32b8e79..5c296d4af4a9 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -120,6 +120,9 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); +extern bool fscrypt_fname_encrypted_size(const struct inode *inode, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); /* keyinfo.c */ extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 8b829400f467..a91f605d81e9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -143,12 +143,12 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ - if (sizeof(struct fscrypt_symlink_data) + len > max_len) + if (!fscrypt_fname_encrypted_size(dir, len, + max_len - sizeof(struct fscrypt_symlink_data), + &disk_link->len)) return -ENAMETOOLONG; - disk_link->len = min_t(unsigned int, - sizeof(struct fscrypt_symlink_data) + - fscrypt_fname_encrypted_size(dir, len), - max_len); + disk_link->len += sizeof(struct fscrypt_symlink_data); + disk_link->name = NULL; return 0; } diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index f5de736cf1c1..5777251400f9 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -130,14 +130,6 @@ static inline void fscrypt_free_filename(struct fscrypt_name *fname) return; } -static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, - u32 ilen) -{ - /* never happens */ - WARN_ON(1); - return 0; -} - static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 max_encrypted_len, struct fscrypt_str *crypto_str) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h 
index ddd79019e3f9..c88d2058902a 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -106,7 +106,6 @@ static inline void fscrypt_free_filename(struct fscrypt_name *fname) kfree(fname->crypto_buf.name); } -extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, struct fscrypt_str *); extern void fscrypt_fname_free_buffer(struct fscrypt_str *); From 7e0e7995ee97a285ca764c854c7e899aecd75949 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Jan 2018 13:45:24 -0800 Subject: [PATCH 0614/1212] fscrypt: fix build with pre-4.6 gcc versions gcc versions prior to 4.6 require an extra level of braces when using a designated initializer for a member in an anonymous struct or union. This caused a compile error with the 'struct qstr' initialization in __fscrypt_encrypt_symlink(). Fix it by using QSTR_INIT(). Reported-by: Andrew Morton Fixes: 76e81d6d5048 ("fscrypt: new helper functions for ->symlink()") Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index a91f605d81e9..bc010e4609ef 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -158,7 +158,7 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { int err; - struct qstr iname = { .name = target, .len = len }; + struct qstr iname = QSTR_INIT(target, len); struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; From 39575737bb62fc391c8cc8ea5dfea09daed57d5d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 29 Jan 2018 19:13:15 +0800 Subject: [PATCH 0615/1212] f2fs: fix potential corruption in area before F2FS_SUPER_OFFSET sb_getblk does not guarantee the buffer head is uptodate. 
If bh is not uptodate, the data (which may be used as boot code) in the area before F2FS_SUPER_OFFSET may get corrupted when the super block is committed. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aaeba346e9d7..63729184bcc4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1898,7 +1898,6 @@ static int __f2fs_commit_super(struct buffer_head *bh, lock_buffer(bh); if (super) memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); - set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); @@ -2338,7 +2337,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) } /* write back-up superblock first */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); + bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); @@ -2349,7 +2348,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; /* write current valid superblock */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block); + bh = sb_bread(sbi->sb, sbi->valid_super_block); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); From 41dda11641377f1233f14aae8fe8b3d0a2989ff8 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 29 Jan 2018 11:37:45 +0800 Subject: [PATCH 0616/1212] f2fs: fix heap mode to reset it back Commit 7a20b8a61eff81bdb7097a578752a74860e9d142 ("f2fs: allocate node and hot data in the beginning of partition") introduces another mount option, heap, to reset it back. But it does not do anything for heap mode, so fix it. 
Cc: stable@vger.kernel.org Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- fs/f2fs/segment.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d0de3429c26c..06de4ca9abc9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -191,8 +191,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - /* let's select beginning hot/small space first */ - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + /* let's select beginning hot/small space first in no_heap mode*/ + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bf98f6f34b7e..4e27b6721ba1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2244,7 +2244,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (sbi->segs_per_sec != 1) return CURSEG_I(sbi, type)->segno; - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) From 2e2a339c9853be971a114d0a572cb85de13d2ad7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Jan 2018 17:29:48 +0800 Subject: [PATCH 0617/1212] f2fs: restrict inline_xattr_size configuration This patch allows the inline_xattr_size mount option to be enabled only if both the extra_attr and flexible_inline_xattr features are on in the current image. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 63729184bcc4..bb0ab4f5e2d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -699,6 +699,13 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!f2fs_sb_has_extra_attr(sb) || + !f2fs_sb_has_flexible_inline_xattr(sb)) { + f2fs_msg(sb, KERN_ERR, + "extra_attr or flexible_inline_xattr " + "feature is off"); + return -EINVAL; + } if (!test_opt(sbi, INLINE_XATTR)) { f2fs_msg(sb, KERN_ERR, "inline_xattr_size option should be " From eceb943d5d592873f67d25b68d66232f7ef44be7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Jan 2018 17:29:49 +0800 Subject: [PATCH 0618/1212] f2fs: fix to check extent cache in f2fs_drop_extent_tree If the noextent_cache mount option is on, we never initialize the extent tree in the inode, but we still access it in f2fs_drop_extent_tree, resulting in a kernel panic as below: BUG: unable to handle kernel NULL pointer dereference at 0000000000000038 IP: _raw_write_lock+0xc/0x30 Call Trace: ? f2fs_drop_extent_tree+0x41/0x70 [f2fs] f2fs_fallocate+0x5a0/0xdd0 [f2fs] ? common_file_perm+0x47/0xc0 ? apparmor_file_permission+0x1a/0x20 vfs_fallocate+0x15b/0x290 SyS_fallocate+0x44/0x70 do_syscall_64+0x6e/0x160 entry_SYSCALL64_slow_path+0x25/0x25 This patch fixes it by checking the extent cache status before using the extent tree in f2fs_drop_extent_tree. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ff2352a0ed15..aff6c2ed1c02 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -706,6 +706,9 @@ void f2fs_drop_extent_tree(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; + if (!f2fs_may_extent_tree(inode)) + return; + set_inode_flag(inode, FI_NO_EXTENT); write_lock(&et->lock); From 0671fae134bb95325ddb35405656af3c9236548d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 19:40:08 +0800 Subject: [PATCH 0619/1212] f2fs: support large nat bitmap Previously, we stored the whole nat version bitmap in the checkpoint pack block, which limited the total number of node entries: it could not exceed (3900 * 8) block * 455 node/block = 14196000. So once a user wants to create more nodes in a large image, this becomes an unreasonable bottleneck. This patch detects the new layout of the nat/sit version bitmap in the image in order to support a large nat bitmap. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++++ include/linux/f2fs_fs.h | 1 + 2 files changed, 7 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e54ffadb692c..20e940f22c5c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1826,6 +1826,12 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); int offset; + if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { + offset = (flag == SIT_BITMAP) ? 
+ le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; + return &ckpt->sit_nat_version_bitmap + offset; + } + if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 073365c9808a..23f23b8e6878 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -116,6 +116,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_LARGE_NAT_BITMAP_FLAG 0x00000400 #define CP_NOCRC_RECOVERY_FLAG 0x00000200 #define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 From 180900373ec1684eb94e748915a5c25dde14774e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 31 Jan 2018 09:30:34 +0800 Subject: [PATCH 0620/1212] f2fs: fix to clear CP_TRIMMED_FLAG Once CP_TRIMMED_FLAG is set, after a reboot, we will never issue discard before the LBA becomes invalid again. Fix it by clearing the flag in any checkpoint without the CP_TRIMMED reason. Fixes: 1f43e2ad7bff ("f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3c343e922f6e..ab1b35856082 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1140,6 +1140,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); From 22fa74c2b0975f8ae05e55860b84ba2557c940ad Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Wed, 31 Jan 2018 11:36:57 +0900 Subject: [PATCH 0621/1212] f2fs: support passing down write hints given by users to block layer Add the 'whint_mode' mount option that controls which write hints are passed down to the block layer. There are two modes: "off" and "user-based". The default mode is "off". 1) whint_mode=off. 
F2FS only passes down WRITE_LIFE_NOT_SET. 2) whint_mode=user-based. F2FS tries to pass down hints given by users. User F2FS Block ---- ---- ----- META WRITE_LIFE_NOT_SET HOT_NODE " WARM_NODE " COLD_NODE " ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME extension list " " -- buffered io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET WRITE_LIFE_NONE " " WRITE_LIFE_MEDIUM " " WRITE_LIFE_LONG " " -- direct io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET WRITE_LIFE_NONE " WRITE_LIFE_NONE WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM WRITE_LIFE_LONG " WRITE_LIFE_LONG Many thanks to Chao Yu and Jaegeuk Kim for comments to implement this patch. Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu [Jaegeuk Kim: avoid build warning] [Chao Yu: fix to restore whint_mode in ->remount_fs] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 32 +++++++++++++++------ fs/f2fs/f2fs.h | 18 ++++++------ fs/f2fs/segment.c | 59 ++++++++++++++++++++++++++++++++++++++ fs/f2fs/super.c | 31 +++++++++++++++++++- include/linux/blk_types.h | 1 + include/linux/fs.h | 17 +++++++++-- include/uapi/linux/fcntl.h | 21 ++++++++++++++ 7 files changed, 158 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c80f138b0f33..680241a10505 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -173,15 +173,22 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, struct writeback_control *wbc, - int npages, bool is_read) + int npages, bool is_read, + enum page_type type, enum temp_type temp) { struct bio *bio; bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); - bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; - bio->bi_private = is_read ? 
NULL : sbi; + if (is_read) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + } if (wbc) wbc_init_bio(wbc, bio); @@ -380,7 +387,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) /* Allocate a new bio */ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, - 1, is_read_io(fio->op)); + 1, is_read_io(fio->op), fio->type, fio->temp); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -443,7 +450,8 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, - BIO_MAX_PAGES, false); + BIO_MAX_PAGES, false, + fio->type, fio->temp); io->fio = *fio; } @@ -867,8 +875,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { - /* map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); */ - map.m_seg_type = rw_hint_to_seg_type(WRITE_LIFE_NOT_SET); + map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); flag = __force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; @@ -1150,8 +1157,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DEFAULT, NULL, rw_hint_to_seg_type( - WRITE_LIFE_NOT_SET)); - /* inode->i_write_hint)); */ + inode->i_write_hint)); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -2292,9 +2298,12 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t count = iov_iter_count(iter); int rw = iov_iter_rw(iter); int err; + enum rw_hint hint = iocb->ki_hint; + int whint_mode = sbi->whint_mode; err = check_direct_IO(inode, iter, offset); if (err) @@ -2305,11 +2314,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, rw); + if (rw == WRITE && whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + down_read(&F2FS_I(inode)->dio_rwsem[rw]); err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20e940f22c5c..2aa47fac39a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1101,6 +1101,11 @@ enum { MAX_TIME, }; +enum { + WHINT_MODE_OFF, /* not pass down write hints */ + WHINT_MODE_USER, /* try to pass down hints given by users */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1284,6 +1289,8 @@ struct f2fs_sb_info { char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif + /* For which write hints are passed down to block layer */ 
+ int whint_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -2573,15 +2580,6 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } -enum rw_hint { - WRITE_LIFE_NOT_SET = 0, - WRITE_LIFE_NONE = 1, /* RWH_WRITE_LIFE_NONE */ - WRITE_LIFE_SHORT = 2, /* RWH_WRITE_LIFE_SHORT */ - WRITE_LIFE_MEDIUM = 3, /* RWH_WRITE_LIFE_MEDIUM */ - WRITE_LIFE_LONG = 4, /* RWH_WRITE_LIFE_LONG */ - WRITE_LIFE_EXTREME = 5, /* RWH_WRITE_LIFE_EXTREME */ -}; - static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { @@ -2862,6 +2860,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); int rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, + enum temp_type temp); /* * checkpoint.c diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4e27b6721ba1..5dc604058205 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2536,6 +2536,62 @@ int rw_hint_to_seg_type(enum rw_hint hint) } } +/* This returns write hints for each segment type. This hints will be + * passed down to block layer. There are mapping tables which depend on + * the mount option 'whint_mode'. + * + * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + * + * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. 
+ * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NOT_SET + * HOT_NODE " + * WARM_NODE " + * COLD_NODE " + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + * + */ + +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + if (sbi->whint_mode == WHINT_MODE_USER) { + if (type == DATA) { + switch (temp) { + case COLD: + return WRITE_LIFE_EXTREME; + case HOT: + return WRITE_LIFE_SHORT; + default: + return WRITE_LIFE_NOT_SET; + } + } else { + return WRITE_LIFE_NOT_SET; + } + } else { + return WRITE_LIFE_NOT_SET; + } +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2724,6 +2780,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = META, + .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_META | REQ_PRIO, .old_blkaddr = page->index, @@ -2772,6 +2829,8 @@ int rewrite_data_page(struct f2fs_io_info *fio) int err; fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bb0ab4f5e2d4..96a720680eec 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -129,6 +129,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, + Opt_whint, Opt_err, }; 
@@ -182,6 +183,7 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_whint, "whint_mode=%s"}, {Opt_err, NULL}, }; @@ -679,6 +681,22 @@ static int parse_options(struct super_block *sb, char *options) "quota operations not supported"); break; #endif + case Opt_whint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 10 && + !strncmp(name, "user-based", 10)) { + sbi->whint_mode = WHINT_MODE_USER; + } else if (strlen(name) == 3 && + !strncmp(name, "off", 3)) { + sbi->whint_mode = WHINT_MODE_OFF; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -722,6 +740,12 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + /* Not pass down write hints if the number of active logs is lesser + * than NR_CURSEG_TYPE. + */ + if (sbi->active_logs != NR_CURSEG_TYPE) + sbi->whint_mode = WHINT_MODE_OFF; return 0; } @@ -1233,6 +1257,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); + if (sbi->whint_mode == WHINT_MODE_USER) + seq_printf(seq, ",whint_mode=%s", "user-based"); return 0; } @@ -1242,6 +1268,7 @@ static void default_options(struct f2fs_sb_info *sbi) /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + sbi->whint_mode = WHINT_MODE_OFF; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1282,6 +1309,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + int old_whint_mode = sbi->whint_mode; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif @@ -1381,7 +1409,7 @@ static 
int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY) { + if (*flags & MS_RDONLY || sbi->whint_mode != old_whint_mode) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1431,6 +1459,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) sbi->s_qf_names[i] = s_qf_names[i]; } #endif + sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0fb65843ec1e..f0942a82bb20 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -47,6 +47,7 @@ struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; unsigned int bi_flags; /* status, command, etc */ + unsigned short bi_write_hint; int bi_error; unsigned long bi_rw; /* bottom bits READ/WRITE, * top bits priority diff --git a/include/linux/fs.h b/include/linux/fs.h index 933978eb92fb..8231cdc25901 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -318,6 +319,18 @@ struct page; struct address_space; struct writeback_control; +/* + * Write life time hint values. 
+ */ +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +}; + #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) @@ -328,6 +341,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; + enum rw_hint ki_hint; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -624,6 +638,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED @@ -1053,8 +1068,6 @@ struct file_lock_context { #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif -#include - extern void send_sigio(struct fown_struct *fown, int fd, int band); #ifdef CONFIG_FILE_LOCKING diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index beed138bd359..f85ed3a5ef4d 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -42,6 +42,27 @@ #define F_SEAL_WRITE 0x0008 /* prevent writes */ /* (1U << 31) is reserved for signed error codes */ +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. 
+ */ +#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + /* * Types of directory notifications that may be requested. */ From 92b12bb1a23e6e808e40d2c01f231b881e44abb2 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Wed, 31 Jan 2018 11:36:58 +0900 Subject: [PATCH 0622/1212] f2fs: support passing down write hints to block layer with F2FS policy Add 'whint_mode=fs-based' mount option. In this mode, F2FS passes down write hints with its policy. * whint_mode=fs-based. F2FS passes down hints with its policy. User F2FS Block ---- ---- ----- META WRITE_LIFE_MEDIUM; HOT_NODE WRITE_LIFE_NOT_SET WARM_NODE " COLD_NODE WRITE_LIFE_NONE ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME extension list " " -- buffered io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG WRITE_LIFE_NONE " " WRITE_LIFE_MEDIUM " " WRITE_LIFE_LONG " " -- direct io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET WRITE_LIFE_NONE " WRITE_LIFE_NONE WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM WRITE_LIFE_LONG " WRITE_LIFE_LONG Many thanks to Chao Yu and Jaegeuk Kim for comments to implement this patch. 
Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 57 +++++++++++++++++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++++ 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2aa47fac39a8..385ad384775a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1104,6 +1104,7 @@ enum { enum { WHINT_MODE_OFF, /* not pass down write hints */ WHINT_MODE_USER, /* try to pass down hints given by users */ + WHINT_MODE_FS, /* pass down hints with F2FS policy */ }; struct f2fs_sb_info { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5dc604058205..3a150018fd2c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2569,6 +2569,32 @@ int rw_hint_to_seg_type(enum rw_hint hint) * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM * WRITE_LIFE_LONG " WRITE_LIFE_LONG * + * 3) whint_mode=fs-based. F2FS passes down hints with its policy. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_MEDIUM; + * HOT_NODE WRITE_LIFE_NOT_SET + * WARM_NODE " + * COLD_NODE WRITE_LIFE_NONE + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, @@ -2576,20 +2602,33 @@ enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, { if (sbi->whint_mode == WHINT_MODE_USER) { if (type == DATA) { - switch (temp) { - case COLD: - return WRITE_LIFE_EXTREME; - case HOT: - return 
WRITE_LIFE_SHORT; - default: + if (temp == WARM) return WRITE_LIFE_NOT_SET; - } + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; } else { return WRITE_LIFE_NOT_SET; } - } else { - return WRITE_LIFE_NOT_SET; + } else if (sbi->whint_mode == WHINT_MODE_FS) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_LONG; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else if (type == NODE) { + if (temp == WARM || temp == HOT) + return WRITE_LIFE_NOT_SET; + else if (temp == COLD) + return WRITE_LIFE_NONE; + } else if (type == META) { + return WRITE_LIFE_MEDIUM; + } } + return WRITE_LIFE_NOT_SET; } static int __get_segment_type_2(struct f2fs_io_info *fio) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96a720680eec..8b6edc4e5cab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -691,6 +691,9 @@ static int parse_options(struct super_block *sb, char *options) } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { sbi->whint_mode = WHINT_MODE_OFF; + } else if (strlen(name) == 8 && + !strncmp(name, "fs-based", 8)) { + sbi->whint_mode = WHINT_MODE_FS; } else { kfree(name); return -EINVAL; @@ -1259,6 +1262,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) f2fs_show_quota_options(seq, sbi->sb); if (sbi->whint_mode == WHINT_MODE_USER) seq_printf(seq, ",whint_mode=%s", "user-based"); + else if (sbi->whint_mode == WHINT_MODE_FS) + seq_printf(seq, ",whint_mode=%s", "fs-based"); return 0; } From 889d98087652d168cccc3ebb84d62efa6e825644 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Feb 2018 17:01:48 -0800 Subject: [PATCH 0623/1212] f2fs: handle quota for orphan inodes This is to detect dquot_initialize errors early from evict_inode for orphan inodes. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ab1b35856082..29bb6209dee2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -573,13 +573,8 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct node_info ni; int err = acquire_orphan_inode(sbi); - if (err) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", - __func__, ino); - return err; - } + if (err) + goto err_out; __add_ino_entry(sbi, ino, 0, ORPHAN_INO); @@ -593,6 +588,11 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } + err = dquot_initialize(inode); + if (err) + goto err_out; + + dquot_initialize(inode); clear_nlink(inode); /* truncate all the data during iput */ @@ -602,14 +602,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x) by kernel, retry mount.", - __func__, ino); - return -EIO; + err = -EIO; + goto err_out; } __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; + +err_out: + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; } int recover_orphan_inodes(struct f2fs_sb_info *sbi) From 190e64a819df14ed6406f6cb075a5177155b4101 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 3 Feb 2018 17:44:39 +0800 Subject: [PATCH 0624/1212] f2fs: fix to handle looped node chain during recovery There is no checksum in node block now, so bit-transition from hardware can make node_footer.next_blkaddr being corrupted w/o any detection, result in node chain becoming looped one. 
For this condition, during recovery, in order to avoid running into dead loop, let's detect it and just skip out. Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 210de28c9cd2..4ddc2262baf1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -242,6 +242,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; + unsigned int loop_cnt = 0; + unsigned int free_blocks = sbi->user_block_count - + valid_user_blocks(sbi); int err = 0; /* get node pages in the current segment */ @@ -294,6 +297,17 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (IS_INODE(page) && is_dent_dnode(page)) entry->last_dentry = blkaddr; next: + /* sanity check in order to detect looped node chain */ + if (++loop_cnt >= free_blocks || + blkaddr == next_blkaddr_of_node(page)) { + f2fs_msg(sbi->sb, KERN_NOTICE, + "%s: detect looped node chain, " + "blkaddr:%u, next:%u", + __func__, blkaddr, next_blkaddr_of_node(page)); + err = -EINVAL; + break; + } + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); From a292477154b522b4dfa38d62f5249e6999a93a82 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 6 Feb 2018 08:21:45 +0800 Subject: [PATCH 0625/1212] f2fs: remove redundant check of page type when submit bio This patch removes redundant check of page type when submit bio to make the logic more clear. 
Signed-off-by: Tiezhu Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 680241a10505..a6ebf4549529 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -201,13 +201,12 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (!is_read_io(bio_op(bio))) { unsigned int start; - if (f2fs_sb_mounted_blkzoned(sbi->sb) && - current->plug && (type == DATA || type == NODE)) - blk_finish_plug(current->plug); - if (type != DATA && type != NODE) goto submit_io; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug) + blk_finish_plug(current->plug); + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; start %= F2FS_IO_SIZE(sbi); From e5081a52ac0965739126d52db39b32a12e7a06b7 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 6 Feb 2018 12:31:17 +0800 Subject: [PATCH 0626/1212] f2fs: clean up f2fs_sb_has_xxx functions This patch introduces F2FS_FEATURE_FUNCS to clean up the definitions of different f2fs_sb_has_xxx functions. 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 51 ++++++++++++----------------------------------- fs/f2fs/file.c | 6 +++--- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 14 ++++++------- fs/f2fs/sysfs.c | 4 ++-- 6 files changed, 28 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a6ebf4549529..9ca848dc9dc0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -204,7 +204,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug) + if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug) blk_finish_plug(current->plug); start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 385ad384775a..1653f6010495 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3267,45 +3267,20 @@ static inline bool f2fs_bio_encrypted(struct bio *bio) return bio->bi_private != NULL; } -static inline int f2fs_sb_has_crypto(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); +#define F2FS_FEATURE_FUNCS(name, flagname) \ +static inline int f2fs_sb_has_##name(struct super_block *sb) \ +{ \ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \ } -static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); -} - -static inline int f2fs_sb_has_extra_attr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); -} - -static inline int f2fs_sb_has_project_quota(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); -} - -static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); -} - -static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); -} - -static 
inline int f2fs_sb_has_quota_ino(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); -} - -static inline int f2fs_sb_has_inode_crtime(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME); -} +F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); +F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); +F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); +F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); +F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); +F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); +F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, @@ -3325,7 +3300,7 @@ static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); + return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 65cda5bc61b7..7c7d0477c057 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1936,7 +1936,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -1946,7 +1946,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -1957,7 +1957,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); int err; - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3a150018fd2c..d4e09133c013 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1565,7 +1565,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb) && + if (f2fs_sb_has_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif @@ -1763,7 +1763,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (f2fs_sb_mounted_blkzoned(sbi->sb) || + if (f2fs_sb_has_blkzoned(sbi->sb) || (force && len < cpc->trim_minlen)) goto skip; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8b6edc4e5cab..b6d70d6d8a27 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -405,14 +405,14 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else if (!f2fs_sb_mounted_blkzoned(sb)) { + } else if (!f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "discard is required for zoned block devices"); return -EINVAL; @@ -561,7 +561,7 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "adaptive mode is not allowed with " "zoned block device feature"); @@ -1283,7 
+1283,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_blkzoned(sbi->sb)) { + if (f2fs_sb_has_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -2250,7 +2250,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) unsigned int n = 0; int err = -EIO; - if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + if (!f2fs_sb_has_blkzoned(sbi->sb)) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != @@ -2461,7 +2461,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sbi->sb)) { + !f2fs_sb_has_blkzoned(sbi->sb)) { f2fs_msg(sbi->sb, KERN_ERR, "Zoned block device feature not enabled\n"); return -EINVAL; @@ -2556,7 +2556,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * devices, but mandatory for host-managed zoned block devices. */ #ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); err = -EOPNOTSUPP; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d978c7b6ea04..374ee5c82f94 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -92,10 +92,10 @@ static ssize_t features_show(struct f2fs_attr *a, if (!sb->s_bdev->bd_part) return snprintf(buf, PAGE_SIZE, "0\n"); - if (f2fs_sb_has_crypto(sb)) + if (f2fs_sb_has_encrypt(sb)) len += snprintf(buf, PAGE_SIZE - len, "%s", "encryption"); - if (f2fs_sb_mounted_blkzoned(sb)) + if (f2fs_sb_has_blkzoned(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sb)) From 946aefc7545d4eacf8f18ffac7db09a7d59e9b8f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 10 Feb 2018 12:12:51 +0800 Subject: [PATCH 0627/1212] f2fs: flush cp pack except cp pack 2 page at first Previously, we attempt to flush the whole cp pack in a single bio, however, when suddenly powering off at this time, we could get into an extreme scenario that cp pack 1 page and cp pack 2 page are updated and latest, but payload or current summaries are still partially outdated. (see reliable write in the UFS specification) This patch submits the whole cp pack except cp pack 2 page at first, and then writes the cp pack 2 page with an extra independent bio with pre-io barrier. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 69 +++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 29bb6209dee2..9db919c423b6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1172,6 +1172,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_unlock_irqrestore(&sbi->cp_lock, flags); } +static void commit_checkpoint(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct writeback_control wbc = { + .for_reclaim = 0, + }; + + /* + * pagevec_lookup_tag and lock_page again will take + * some extra time. Therefore, update_meta_pages and + * sync_meta_pages are combined in this function. 
+ */ + struct page *page = grab_meta_page(sbi, blk_addr); + int err; + + memcpy(page_address(page), src, PAGE_SIZE); + set_page_dirty(page); + + f2fs_wait_on_page_writeback(page, META, true); + f2fs_bug_on(sbi, PageWriteback(page)); + if (unlikely(!clear_page_dirty_for_io(page))) + f2fs_bug_on(sbi, 1); + + /* writeout cp pack 2 page */ + err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); + f2fs_bug_on(sbi, err); + + f2fs_put_page(page, 0); + + /* submit checkpoint (with barrier if NOBARRIER is not set) */ + f2fs_submit_merged_write(sbi, META_FLUSH); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1274,16 +1307,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } } - /* need to wait for end_io results */ - wait_on_all_pages_writeback(sbi); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - - /* flush all device cache */ - err = f2fs_flush_device_cache(sbi); - if (err) - return err; - /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1311,26 +1334,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += NR_CURSEG_NODE_TYPE; } - /* writeout checkpoint block */ - update_meta_page(sbi, ckpt, start_blk); + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); - /* wait for previous submitted node/meta pages writeback */ + /* Here, we have one bio having CP pack except cp pack 2 page */ + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + + /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) return -EIO; - filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return 
err; - /* update user_block_counts */ - sbi->last_valid_block_count = sbi->total_valid_block_count; - percpu_counter_set(&sbi->alloc_valid_block_count, 0); - - /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); - - /* wait for previous submitted meta pages writeback */ + /* barrier and flush checkpoint cp pack 2 page if it can */ + commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); release_ino_entry(sbi, false); From 1f6bac14c10061c2556deb4bf50600971d911b50 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Feb 2018 18:13:40 +0000 Subject: [PATCH 0628/1212] f2fs: remove redundant initialization of pointer 'p' Pointer p is initialized with a value that is never read and is later re-assigned a new value, hence the initialization is redundant and can be removed. Cleans up clang warning: fs/f2fs/extent_cache.c:463:19: warning: Value stored to 'p' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index aff6c2ed1c02..d5a861bf2b42 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -460,7 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, struct rb_node *insert_parent) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct rb_node **p = &et->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; From 4d409fa3346bf97cc68435cf49a6ab7c5733b27f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 11 Feb 2018 22:53:20 +0800 Subject: [PATCH 0629/1212] f2fs: introduce sb_lock to make encrypt pwsalt update exclusive f2fs_super_block.encrypt_pw_salt can be updated and persisted concurrently, resulting in different threads getting different pwsalt values, so let's introduce sb_lock to exclude concurrent 
accessors. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 19 +++++++++++-------- fs/f2fs/super.c | 2 ++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1653f6010495..a8ea66cb45ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1111,6 +1111,7 @@ struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + struct mutex sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7c7d0477c057..9152fb41764a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1960,13 +1960,15 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; - if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) - goto got_it; - err = mnt_want_write_file(filp); if (err) return err; + mutex_lock(&sbi->sb_lock); + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + /* update superblock with uuid */ generate_random_uuid(sbi->raw_super->encrypt_pw_salt); @@ -1974,15 +1976,16 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); - mnt_drop_write_file(filp); - return err; + goto out_err; } - mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) - return -EFAULT; - return 0; + err = -EFAULT; +out_err: + mutex_unlock(&sbi->sb_lock); + mnt_drop_write_file(filp); + return err; } static int f2fs_ioc_gc(struct file *filp, unsigned long arg) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b6d70d6d8a27..f86374cc4470 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2225,6 +2225,8 
@@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); + + mutex_init(&sbi->sb_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) From 98b329de5026821e871b933aeb8815d3ceb3b03b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 25 Feb 2018 23:38:21 +0800 Subject: [PATCH 0630/1212] f2fs: fix to set KEEP_SIZE bit in f2fs_zero_range As Jayashree Mohan reported: A simple workload to reproduce this would be : 1. create foo 2. Write (8K - 16K) // foo size = 16K now 3. fsync() 4. falloc zero_range , keep_size (4202496 - 4210688) // foo size must be 16K 5. fdatasync() Crash now On recovery, we see that the file size is 4210688 and not 16K, which violates the semantics of keep_size flag. We have a test case to reproduce this using CrashMonkey on 4.15 kernel. Try this out by simply running : ./c_harness -f /dev/sda -d /dev/cow_ram0 -t f2fs -e 102400 -P -v tests/generic_468_zero.so The root cause is that we miss to set KEEP_SIZE bit correctly in zero_range when zeroing block cross EOF with FALLOC_FL_KEEP_SIZE, let's fix this missing case. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9152fb41764a..84614f5d1689 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1354,8 +1354,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } out_sem: up_write(&F2FS_I(inode)->i_mmap_sem); From 766d2321697fe98dd0db6b06aa4b41ef9559f506 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 26 Feb 2018 22:04:13 +0800 Subject: [PATCH 0631/1212] f2fs: expose extension_list sysfs entry This patch adds a sysfs entry 'extension_list' to support query/add/del item in extension list. Query: cat /sys/fs/f2fs//extension_list Add: echo 'extension' > /sys/fs/f2fs//extension_list Del: echo '!extension' > /sys/fs/f2fs//extension_list Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 ++++++ fs/f2fs/f2fs.h | 4 ++- fs/f2fs/file.c | 4 +-- fs/f2fs/namei.c | 42 +++++++++++++++++++++++-- fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 40 +++++++++++++++++++++++ include/linux/f2fs_fs.h | 3 +- 7 files changed, 96 insertions(+), 8 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index db7aab1516de..be3f74ec05b5 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -192,3 +192,12 @@ Date: November 2017 Contact: "Sheng Yong" Description: Controls readahead inode block in readdir. 
+ +What: /sys/fs/f2fs/<disk>/extension_list +Date: February 2018 +Contact: "Chao Yu" +Description: + Used to query and configure the extension list: + - Query: cat /sys/fs/f2fs/<disk>/extension_list + - Add: echo 'extension' > /sys/fs/f2fs/<disk>/extension_list + - Del: echo '!extension' > /sys/fs/f2fs/<disk>/extension_list diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8ea66cb45ed..621603b1835c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1111,7 +1111,7 @@ struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - struct mutex sb_lock; /* lock for raw super block */ + struct rw_semaphore sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ @@ -2690,6 +2690,8 @@ void handle_failed_inode(struct inode *inode); /* * namei.c */ +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84614f5d1689..e418fc5b3fed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1968,7 +1968,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) return err; - mutex_lock(&sbi->sb_lock); + down_write(&sbi->sb_lock); if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) goto got_it; @@ -1987,7 +1987,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) 16)) err = -EFAULT; out_err: - mutex_unlock(&sbi->sb_lock); + up_write(&sbi->sb_lock); mnt_drop_write_file(filp); return err; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 72328a18c086..685f94ba760b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -171,16 +171,52 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned 
char *name) { - int i; - __u8 (*extlist)[8] = sbi->raw_super->extension_list; + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, count; + + down_read(&sbi->sb_lock); + + count = le32_to_cpu(sbi->raw_super->extension_count); - int count = le32_to_cpu(sbi->raw_super->extension_count); for (i = 0; i < count; i++) { if (is_multimedia_file(name, extlist[i])) { file_set_cold(inode); break; } } + + up_read(&sbi->sb_lock); +} + +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int count = le32_to_cpu(sbi->raw_super->extension_count); + int i; + + for (i = 0; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (count - i - 1)); + memset(extlist[count - 1], 0, F2FS_EXTENSION_LEN); + sbi->raw_super->extension_count = cpu_to_le32(count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (count == F2FS_MAX_EXTENSION) + return -EINVAL; + + strncpy(extlist[count], name, strlen(name)); + sbi->raw_super->extension_count = cpu_to_le32(count + 1); + return 0; } static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f86374cc4470..ec68aa982649 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2226,7 +2226,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); - mutex_init(&sbi->sb_lock); + init_rwsem(&sbi->sb_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 374ee5c82f94..d27b28e602a6 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -136,6 +136,18 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + __u8 (*extlist)[F2FS_EXTENSION_LEN] = + 
sbi->raw_super->extension_list; + int count = le32_to_cpu(sbi->raw_super->extension_count); + int len = 0, i; + + for (i = 0; i < count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); @@ -154,6 +166,32 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + const char *name = strim((char *)buf); + bool set = true; + + if (name[0] == '!') { + name++; + set = false; + } + + if (strlen(name) >= F2FS_EXTENSION_LEN) + return -EINVAL; + + down_write(&sbi->sb_lock); + + ret = update_extension_list(sbi, name, set); + if (ret) + goto out; + + ret = f2fs_commit_super(sbi, false); + if (ret) + update_extension_list(sbi, name, !set); +out: + up_write(&sbi->sb_lock); + return ret ? ret : count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -307,6 +345,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -357,6 +396,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), + ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 23f23b8e6878..bee1211bc2b9 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -21,6 +21,7 @@ #define 
F2FS_BLKSIZE 4096 /* support only 4KB block */ #define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ +#define F2FS_EXTENSION_LEN 8 /* max size of extension */ #define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ @@ -101,7 +102,7 @@ struct f2fs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */ __le32 extension_count; /* # of extensions below */ - __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ + __u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];/* extension array */ __le32 cp_payload; __u8 version[VERSION_LEN]; /* the kernel version */ __u8 init_version[VERSION_LEN]; /* the initial kernel version */ From b7982989124958d1ad880bae0b5169e6eaa00421 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 26 Feb 2018 09:19:47 -0800 Subject: [PATCH 0632/1212] f2fs: don't stop GC if GC is contended Let's do GC as much as possible, while gc_urgent is set. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 06de4ca9abc9..7725999394b0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,14 +76,15 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (!mutex_trylock(&sbi->gc_mutex)) - goto next; - if (gc_th->gc_urgent) { wait_ms = gc_th->urgent_sleep_time; + mutex_lock(&sbi->gc_mutex); goto do_gc; } + if (!mutex_trylock(&sbi->gc_mutex)) + goto next; + if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); From 0ffdffc8f106628a4c6bc3eed2eb3cf88393d2f3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 18 Feb 2018 08:50:49 -0800 Subject: [PATCH 0633/1212] f2fs: add mount option for segment allocation policy This patch adds a mount option, "alloc_mode=%s", which has two options, "default" and "reuse". In the "alloc_mode=reuse" case, f2fs always starts allocating segments from the 0'th segment in order to reassign segments. It'd be useful for small-sized eMMC parts. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 8 ++++++++ fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/segment.c | 5 +++++ fs/f2fs/super.c | 26 ++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 6cf9ad12c57f..579c1119131d 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -172,6 +172,14 @@ offgrpjquota Turn off group journelled quota. offprjjquota Turn off project journelled quota. quota Enable plain user disk quota accounting. noquota Disable all plain disk quota option. +whint_mode=%s Control which write hints are passed down to block + layer. This supports "off", "user-based", and + "fs-based". In "off" mode (default), f2fs does not pass + down hints. In "user-based" mode, f2fs tries to pass + down hints given by users. And in "fs-based" mode, f2fs + passes down hints with its policy. +alloc_mode=%s Adjust block allocation policy, which supports "reuse" + and "default". 
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 621603b1835c..1f88986207ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1107,6 +1107,11 @@ enum { WHINT_MODE_FS, /* pass down hints with F2FS policy */ }; +enum { + ALLOC_MODE_DEFAULT, /* stay default */ + ALLOC_MODE_REUSE, /* reuse segments as much as possible */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1293,6 +1298,9 @@ struct f2fs_sb_info { #endif /* For which write hints are passed down to block layer */ int whint_mode; + + /* segment allocation policy */ + int alloc_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d4e09133c013..da498a1de469 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2250,6 +2250,11 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + + /* find segments from 0 to reuse freed segments */ + if (sbi->alloc_mode == ALLOC_MODE_REUSE) + return 0; + return CURSEG_I(sbi, type)->segno; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ec68aa982649..ff59af55ccd4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -130,6 +130,7 @@ enum { Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_whint, + Opt_alloc, Opt_err, }; @@ -184,6 +185,7 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_whint, "whint_mode=%s"}, + {Opt_alloc, "alloc_mode=%s"}, {Opt_err, NULL}, }; @@ -700,6 +702,23 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_alloc: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (strlen(name) == 7 && + !strncmp(name, "default", 7)) { + sbi->alloc_mode = ALLOC_MODE_DEFAULT; + } else if (strlen(name) == 5 
&& + !strncmp(name, "reuse", 5)) { + sbi->alloc_mode = ALLOC_MODE_REUSE; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -1265,6 +1284,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (sbi->whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); + if (sbi->alloc_mode == ALLOC_MODE_DEFAULT) + seq_printf(seq, ",alloc_mode=%s", "default"); + else if (sbi->alloc_mode == ALLOC_MODE_REUSE) + seq_printf(seq, ",alloc_mode=%s", "reuse"); return 0; } @@ -1274,6 +1297,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->active_logs = NR_CURSEG_TYPE; sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; sbi->whint_mode = WHINT_MODE_OFF; + sbi->alloc_mode = ALLOC_MODE_DEFAULT; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1315,6 +1339,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); int old_whint_mode = sbi->whint_mode; + int old_alloc_mode = sbi->alloc_mode; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif @@ -1464,6 +1489,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) sbi->s_qf_names[i] = s_qf_names[i]; } #endif + sbi->alloc_mode = old_alloc_mode; sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; From 1aa536a624cc246bcafd5ace82abe3b50e47c802 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 22 Feb 2018 14:09:30 -0800 Subject: [PATCH 0634/1212] f2fs: add auto tuning for small devices If f2fs is running on top of very small devices, it's worth to avoid abusing free LBAs. In order to achieve that, this patch introduces some parameter tuning. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5d6d3e72be31..d1524d16b2a0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -596,6 +596,8 @@ static inline int utilization(struct f2fs_sb_info *sbi) #define DEF_MIN_FSYNC_BLOCKS 8 #define DEF_MIN_HOT_BLOCKS 16 +#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ + enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ff59af55ccd4..28c49fc34e86 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2523,6 +2523,18 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) return 0; } +static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + + /* adjust parameters according to the volume size */ + if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + sbi->alloc_mode = ALLOC_MODE_REUSE; + sm_i->dcc_info->discard_granularity = 1; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + } +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -2875,6 +2887,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_join_shrinker(sbi); + f2fs_tuning_parameters(sbi); + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); From a5052f32b940d492403a8a6624ce88094bfb610e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 25 Feb 2018 01:04:57 -0800 Subject: [PATCH 0635/1212] f2fs: set readdir_ra by default It gives general readdir improvement. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 28c49fc34e86..85e4b938b996 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1298,6 +1298,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; sbi->whint_mode = WHINT_MODE_OFF; sbi->alloc_mode = ALLOC_MODE_DEFAULT; + sbi->readdir_ra = 1; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); From 10b2d001d6ace7f509bda9321a729b6949cc6ea0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 22 Feb 2018 23:30:55 -0800 Subject: [PATCH 0636/1212] f2fs: issue discard aggressively in the gc_urgent mode This patch avoids to skip discard commands when user sets gc_urgent mode. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index da498a1de469..c217a91088af 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1491,12 +1491,11 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; - if (dcc->discard_wake) { + if (dcc->discard_wake) dcc->discard_wake = 0; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, - DPOLICY_FORCE, 1); - } + + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + init_discard_policy(&dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); @@ -1807,7 +1806,7 @@ void init_discard_policy(struct discard_policy *dpolicy, } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; + dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { From 1e0aeb0af9ed3b16b4c2543aa2c6502a153b897b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 26 Feb 2018 
15:40:30 -0800 Subject: [PATCH 0637/1212] f2fs: do gc in greedy mode for whole range if gc_urgent mode is set Otherwise, f2fs conducts GC on 8GB range only based on slow cost-benefit. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7725999394b0..54f51a990794 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -162,12 +162,17 @@ static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) { int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; - if (gc_th && gc_th->gc_idle) { + if (!gc_th) + return gc_mode; + + if (gc_th->gc_idle) { if (gc_th->gc_idle == 1) gc_mode = GC_CB; else if (gc_th->gc_idle == 2) gc_mode = GC_GREEDY; } + if (gc_th->gc_urgent) + gc_mode = GC_GREEDY; return gc_mode; } @@ -189,7 +194,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, } /* we need to check every dirty segments in the FG_GC case */ - if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) + if (gc_type != FG_GC && + (sbi->gc_thread && !sbi->gc_thread->gc_urgent) && + p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first in no_heap mode*/ From 58edcdbca67ab09ef7631e7a94a5bd5190895631 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 27 Feb 2018 22:45:24 +0800 Subject: [PATCH 0638/1212] f2fs: fix to avoid race in between atomic write and background GC Sqlite user Background GC - move_data_block : move page #1 - f2fs_is_atomic_file - f2fs_ioc_start_atomic_write - f2fs_ioc_commit_atomic_write - commit_inmem_pages : commit page #1 & set node #2 dirty - f2fs_submit_page_write - f2fs_update_data_blkaddr - set_page_dirty : set node #2 dirty - f2fs_do_sync_file - fsync_node_pages : commit node #1 & node #2, then sudden power-cut In a race case, we may check FI_ATOMIC_FILE flag before starting atomic write flow, then we will commit meta data before data with 
reversed order, after a sudden power-cut, database transaction will be inconsistent. So we'd better make gc and atomic_write exclude each other by using a lock instead of flag checking. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e418fc5b3fed..8ec080550a37 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1713,6 +1713,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + if (f2fs_is_volatile_file(inode)) goto err_out; @@ -1731,6 +1733,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; From 076a6f32fe5d2d8c43f44e625c67d796eeb8f1ed Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Feb 2018 17:07:27 +0800 Subject: [PATCH 0639/1212] f2fs: support hot file extension This patch adds support for recognizing hot file extensions in f2fs, so that we can allocate a proper hot segment location for their data, which can lead to better hot/cold separation in the filesystem.
In addition, we slightly change the query/add/del operation method for the extension_list sysfs entry as below: - Query: cat /sys/fs/f2fs//extension_list - Add: echo 'extension' > /sys/fs/f2fs//extension_list - Del: echo '!extension' > /sys/fs/f2fs//extension_list - Add: echo '[h/c]extension' > /sys/fs/f2fs//extension_list - Del: echo '[h/c]!extension' > /sys/fs/f2fs//extension_list - [h] means add/del hot file extension - [c] means add/del cold file extension Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 +- fs/f2fs/f2fs.h | 6 +- fs/f2fs/namei.c | 77 +++++++++++++++++++------ fs/f2fs/segment.c | 3 +- fs/f2fs/sysfs.c | 30 ++++++++-- include/linux/f2fs_fs.h | 3 +- 6 files changed, 95 insertions(+), 30 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index be3f74ec05b5..b8d0a30f1644 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -199,5 +199,7 @@ Contact: "Chao Yu" Description: Used to control configure extension list: - Query: cat /sys/fs/f2fs//extension_list - - Add: echo 'extension' > /sys/fs/f2fs//extension_list - - Del: echo '!extension' > /sys/fs/f2fs//extension_list + - Add: echo '[h/c]extension' > /sys/fs/f2fs//extension_list + - Del: echo '[h/c]!extension' > /sys/fs/f2fs//extension_list + - [h] means add/del hot file extension + - [c] means add/del cold file extension diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f88986207ed..e3bfecf7852b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -640,6 +640,7 @@ enum { #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 +#define FADVISE_HOT_BIT 0x20 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -654,6 +655,9 @@ enum { #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) #define file_keep_isize(inode) 
is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) +#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) +#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) #define DEF_DIR_LEVEL 0 @@ -2699,7 +2703,7 @@ void handle_failed_inode(struct inode *inode); * namei.c */ int update_extension_list(struct f2fs_sb_info *sbi, const char *name, - bool set); + bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 685f94ba760b..794dac1c64b3 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -142,7 +142,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(err); } -static int is_multimedia_file(const unsigned char *s, const char *sub) +static int is_extension_exist(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -168,33 +168,59 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, count; + int i, cold_count, hot_count; down_read(&sbi->sb_lock); - count = le32_to_cpu(sbi->raw_super->extension_count); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; - for (i = 0; i < count; i++) { - if (is_multimedia_file(name, extlist[i])) { + for (i = 0; i < cold_count + hot_count; i++) { + if (!is_extension_exist(name, extlist[i])) + continue; + if (i < cold_count) file_set_cold(inode); - break; - } + else + file_set_hot(inode); + break; } up_read(&sbi->sb_lock); } -int 
update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool set) +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int count = le32_to_cpu(sbi->raw_super->extension_count); + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; int i; - for (i = 0; i < count; i++) { + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { if (strcmp(name, extlist[i])) continue; @@ -202,20 +228,33 @@ int update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool set) return -EINVAL; memcpy(extlist[i], extlist[i + 1], - F2FS_EXTENSION_LEN * (count - i - 1)); - memset(extlist[count - 1], 0, F2FS_EXTENSION_LEN); - sbi->raw_super->extension_count = cpu_to_le32(count - 1); + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); return 0; } if (!set) return -EINVAL; - if (count == F2FS_MAX_EXTENSION) - return -EINVAL; + if (hot) { + strncpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; - strncpy(extlist[count], name, strlen(name)); - sbi->raw_super->extension_count = cpu_to_le32(count + 1); + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + strncpy(extlist[cold_count], name, strlen(name)); + 
memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } return 0; } @@ -239,7 +278,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_files(sbi, inode, dentry->d_name.name); + set_file_temperature(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c217a91088af..2d753f9b7499 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2667,7 +2667,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - if (is_inode_flag_set(inode, FI_HOT_DATA)) + if (file_is_hot(inode) || + is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; /* rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d27b28e602a6..23a2d8d66c43 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -139,10 +139,19 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!strcmp(a->attr.name, "extension_list")) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int count = le32_to_cpu(sbi->raw_super->extension_count); + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; int len = 0, i; - for (i = 0; i < count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, + "cold file extenstion:\n"); + for (i = 0; i < cold_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + + len += snprintf(buf + len, PAGE_SIZE - len, + "hot file extenstion:\n"); + for (i = cold_count; i < cold_count + hot_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); return len; @@ -168,9 +177,18 @@ static ssize_t 
f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "extension_list")) { const char *name = strim((char *)buf); - bool set = true; + bool set = true, hot; - if (name[0] == '!') { + if (!strncmp(name, "[h]", 3)) + hot = true; + else if (!strncmp(name, "[c]", 3)) + hot = false; + else + return -EINVAL; + + name += 3; + + if (*name == '!') { name++; set = false; } @@ -180,13 +198,13 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, down_write(&sbi->sb_lock); - ret = update_extension_list(sbi, name, set); + ret = update_extension_list(sbi, name, hot, set); if (ret) goto out; ret = f2fs_commit_super(sbi, false); if (ret) - update_extension_list(sbi, name, !set); + update_extension_list(sbi, name, hot, !set); out: up_write(&sbi->sb_lock); return ret ? ret : count; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index bee1211bc2b9..bb92fd5b5841 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -111,7 +111,8 @@ struct f2fs_super_block { __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ struct f2fs_device devs[MAX_DEVICES]; /* device list */ __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ - __u8 reserved[315]; /* valid reserved region */ + __u8 hot_ext_count; /* # of hot file extension */ + __u8 reserved[314]; /* valid reserved region */ } __packed; /* From 6c6611223a79ead8030efbe3443f870c0f11540f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 8 Mar 2018 20:47:33 -0800 Subject: [PATCH 0640/1212] f2fs: avoid selinux denial on CAP_SYS_RESOURCE This fixes CAP_SYS_RESOURCE denial of selinux when using resgid, since it seems selinux reports it at the first place, but mostly we don't need to check this condition first. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e3bfecf7852b..3e05162bbeb7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1671,13 +1671,13 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, return false; if (IS_NOQUOTA(inode)) return true; - if (capable(CAP_SYS_RESOURCE)) - return true; if (uid_eq(sbi->s_resuid, current_fsuid())) return true; if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) return true; + if (capable(CAP_SYS_RESOURCE)) + return true; return false; } From 0c9c3e034410c4b1410fc3dad4d2657d71539ae4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 1 Mar 2018 23:40:31 +0800 Subject: [PATCH 0641/1212] f2fs: wrap sb_rdonly with f2fs_readonly Use f2fs_readonly to wrap sb_rdonly for cleanup, and spread it in all places. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/super.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3e05162bbeb7..be7f236a38da 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2531,8 +2531,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) return ret; } -#define sb_rdonly f2fs_readonly -static inline int f2fs_readonly(struct super_block *sb) +static inline bool f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 85e4b938b996..8db821b2d78e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -336,7 +336,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "QUOTA feature is enabled, so ignore jquota_fmt"); sbi->s_jquota_fmt = 0; } - if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, "Filesystem with quota feature cannot be mounted RDWR " 
"without CONFIG_QUOTA"); @@ -2813,7 +2813,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) { f2fs_msg(sb, KERN_ERR, @@ -2898,7 +2898,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_meta: #ifdef CONFIG_QUOTA - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif f2fs_sync_inode_meta(sbi); From 6bc490f0eedcd21df5a41e9369cdafed154c9e95 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 1 Mar 2018 23:40:32 +0800 Subject: [PATCH 0642/1212] f2fs: fix to restore old mount option in ->remount_fs This patch fixes to restore old mount option once we encounter failure in ->remount_fs. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8db821b2d78e..dca74d62d9d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1341,6 +1341,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); int old_whint_mode = sbi->whint_mode; int old_alloc_mode = sbi->alloc_mode; + int old_inline_xattr_size = sbi->inline_xattr_size; + block_t old_root_reserved_blocks = sbi->root_reserved_blocks; + kuid_t old_resuid = sbi->s_resuid; + kgid_t old_resgid = sbi->s_resgid; + int old_write_io_size_bits = sbi->write_io_size_bits; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif @@ -1490,6 +1495,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) sbi->s_qf_names[i] = s_qf_names[i]; } #endif + sbi->write_io_size_bits = old_write_io_size_bits; + sbi->s_resgid = old_resgid; + sbi->s_resuid = old_resuid; + sbi->root_reserved_blocks = old_root_reserved_blocks; + sbi->inline_xattr_size = old_inline_xattr_size; sbi->alloc_mode = old_alloc_mode; sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; From 0bdeb167c843f33fffa3bd046b9e9e9eb8ff01ea Mon Sep 17 00:00:00 2001 From: Junling Zheng Date: Wed, 7 Mar 2018 12:07:49 +0800 Subject: [PATCH 0643/1212] f2fs: introduce mount option for fsync mode Commit "0a007b97aad6"(f2fs: recover directory operations by fsync) fixed xfstest generic/342 case, but it also increased the written data and caused the performance degradation. In most cases, there's no need to do so heavy fsync actually. So we introduce new mount option "fsync_mode={posix,strict}" to control the policy of fsync. "fsync_mode=posix" is set by default, and means that f2fs uses a light fsync, which follows POSIX semantics. 
And "fsync_mode=strict" means that it's a heavy fsync, which behaves in line with xfs, ext4 and btrfs, where generic/342 will pass, but the performance will regress. Signed-off-by: Junling Zheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 7 +++++++ fs/f2fs/dir.c | 3 ++- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 3 ++- fs/f2fs/namei.c | 9 ++++++--- fs/f2fs/super.c | 26 ++++++++++++++++++++++++++ 6 files changed, 51 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 579c1119131d..fb92e6f25adf 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -180,6 +180,13 @@ whint_mode=%s Control which write hints are passed down to block passes down hints with its policy. alloc_mode=%s Adjust block allocation policy, which supports "reuse" and "default". +fsync_mode=%s Control the policy of fsync. Currently supports "posix" + and "strict". In "posix" mode, which is default, fsync + will follow POSIX semantics and does a light operation + to improve the filesystem performance. In "strict" mode, + fsync will be heavy and behaves in line with xfs, ext4 + and btrfs, where xfstest generic/342 will pass, but the + performance will regress. 
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 560b707050ca..bb3b8ef1a890 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -704,7 +704,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); - add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + if (F2FS_I_SB(dir)->fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index be7f236a38da..1ec04a58576e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1116,6 +1116,11 @@ enum { ALLOC_MODE_REUSE, /* reuse segments as much as possible */ }; +enum fsync_mode { + FSYNC_MODE_POSIX, /* fsync follows posix semantics */ + FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1305,6 +1310,9 @@ struct f2fs_sb_info { /* segment allocation policy */ int alloc_mode; + + /* fsync policy */ + int fsync_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8ec080550a37..57afbf3e09ea 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -168,7 +168,8 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; - else if (need_dentry_mark(sbi, inode->i_ino) && + else if (sbi->fsync_mode == FSYNC_MODE_STRICT && + need_dentry_mark(sbi, inode->i_ino) && exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 794dac1c64b3..2b00eb44bb90 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -967,7 +967,8 @@ static int 
f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); f2fs_i_links_write(old_dir, false); } - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (sbi->fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -1117,8 +1118,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); - add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (sbi->fsync_mode == FSYNC_MODE_STRICT) { + add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + } f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dca74d62d9d4..b7c3f3b18a6d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -131,6 +131,7 @@ enum { Opt_jqfmt_vfsv1, Opt_whint, Opt_alloc, + Opt_fsync, Opt_err, }; @@ -186,6 +187,7 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, + {Opt_fsync, "fsync_mode=%s"}, {Opt_err, NULL}, }; @@ -719,6 +721,22 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_fsync: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 5 && + !strncmp(name, "posix", 5)) { + sbi->fsync_mode = FSYNC_MODE_POSIX; + } else if (strlen(name) == 6 && + !strncmp(name, "strict", 6)) { + sbi->fsync_mode = FSYNC_MODE_STRICT; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -1288,6 +1306,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",alloc_mode=%s", "default"); else if (sbi->alloc_mode == ALLOC_MODE_REUSE) seq_printf(seq, ",alloc_mode=%s", "reuse"); + + if (sbi->fsync_mode == FSYNC_MODE_POSIX) + 
seq_printf(seq, ",fsync_mode=%s", "posix"); + else if (sbi->fsync_mode == FSYNC_MODE_STRICT) + seq_printf(seq, ",fsync_mode=%s", "strict"); return 0; } @@ -1298,6 +1321,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; sbi->whint_mode = WHINT_MODE_OFF; sbi->alloc_mode = ALLOC_MODE_DEFAULT; + sbi->fsync_mode = FSYNC_MODE_POSIX; sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -1341,6 +1365,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); int old_whint_mode = sbi->whint_mode; int old_alloc_mode = sbi->alloc_mode; + int old_fsync_mode = sbi->fsync_mode; int old_inline_xattr_size = sbi->inline_xattr_size; block_t old_root_reserved_blocks = sbi->root_reserved_blocks; kuid_t old_resuid = sbi->s_resuid; @@ -1501,6 +1526,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) sbi->root_reserved_blocks = old_root_reserved_blocks; sbi->inline_xattr_size = old_inline_xattr_size; sbi->alloc_mode = old_alloc_mode; + sbi->fsync_mode = old_fsync_mode; sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; From 5738be52b3e88fab6008a95bab75548ef2f47826 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 7 Mar 2018 16:22:50 +0800 Subject: [PATCH 0644/1212] f2fs: Don't overwrite all types of node to keep node chain Currently, we enable node SSR by default and mix different types of node segments to do SSR more intensively. Although reusing warm nodes is not allowed, the warm node chain will be destroyed by errors introduced by the node chains of other types. So we'd better forbid reusing all types of nodes to keep the warm node chain. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2d753f9b7499..92a46a7ba931 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1942,7 +1942,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ - if (se->type == CURSEG_WARM_NODE) { + if (IS_NODESEG(se->type)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } From d909e9410634d321ae6931e87bb0ad5eaac3fa62 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 8 Mar 2018 14:22:56 +0800 Subject: [PATCH 0645/1212] f2fs: wrap all options with f2fs_sb_info.mount_opt This patch merges miscellaneous mount options into struct f2fs_mount_info. After this patch, once we add a new mount option, we don't need to worry about recovering it in remount_fs(), since we will recover f2fs_sb_info.mount_opt, which includes all options. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 64 ++++++------ fs/f2fs/file.c | 4 +- fs/f2fs/namei.c | 6 +- fs/f2fs/segment.c | 8 +- fs/f2fs/super.c | 226 +++++++++++++++++++--------------------- fs/f2fs/sysfs.c | 4 +- include/linux/f2fs_fs.h | 8 +- 9 files changed, 154 insertions(+), 170 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ca848dc9dc0..1e78f55c9a7a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2302,7 +2302,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int rw = iov_iter_rw(iter); int err; enum rw_hint hint = iocb->ki_hint; - int whint_mode = sbi->whint_mode; + int whint_mode = F2FS_OPTION(sbi).whint_mode; err = check_direct_IO(inode, iter, offset); if (err) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bb3b8ef1a890..02c32c96fe09 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -704,7 +704,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); - if (F2FS_I_SB(dir)->fsync_mode == FSYNC_MODE_STRICT) + if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ec04a58576e..fa93ef53be34 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -99,9 +99,10 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 -#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) +#define F2FS_OPTION(sbi) ((sbi)->mount_opt) +#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option) +#define 
test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -114,7 +115,25 @@ typedef u32 block_t; /* typedef u32 nid_t; struct f2fs_mount_info { - unsigned int opt; + unsigned int opt; + int write_io_size_bits; /* Write IO size bits */ + block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + int active_logs; /* # of active logs */ + int inline_xattr_size; /* inline xattr size */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; /* For fault injection */ +#endif +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + /* For which write hints are passed down to block layer */ + int whint_mode; + int alloc_mode; /* segment allocation policy */ + int fsync_mode; /* fsync policy */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -1145,7 +1164,6 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; /* bio ordering for NODE/DATA */ - int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ @@ -1195,9 +1213,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ - int active_logs; /* # of active logs */ int dir_level; /* directory level */ - int inline_xattr_size; /* inline xattr size */ unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ int readdir_ra; /* readahead inode in readdir */ @@ -1207,9 +1223,6 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved 
blocks */ block_t current_reserved_blocks; /* current reserved blocks */ - block_t root_reserved_blocks; /* root reserved blocks */ - kuid_t s_resuid; /* reserved blocks for uid */ - kgid_t s_resgid; /* reserved blocks for gid */ unsigned int nquota_files; /* # of quota sysfile */ @@ -1294,25 +1307,6 @@ struct f2fs_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; - - /* For fault injection */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info fault_info; -#endif - -#ifdef CONFIG_QUOTA - /* Names of quota files with journalled quota */ - char *s_qf_names[MAXQUOTAS]; - int s_jquota_fmt; /* Format of quota to use */ -#endif - /* For which write hints are passed down to block layer */ - int whint_mode; - - /* segment allocation policy */ - int alloc_mode; - - /* fsync policy */ - int fsync_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1322,7 +1316,7 @@ struct f2fs_sb_info { __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { - struct f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (!ffi->inject_rate) return false; @@ -1679,10 +1673,10 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, return false; if (IS_NOQUOTA(inode)) return true; - if (uid_eq(sbi->s_resuid, current_fsuid())) + if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) return true; - if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && - in_group_p(sbi->s_resgid)) + if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && + in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; if (capable(CAP_SYS_RESOURCE)) return true; @@ -1720,7 +1714,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks; if (!__allow_reserved_blocks(sbi, inode)) - avail_user_block_count -= sbi->root_reserved_blocks; + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if 
(unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; @@ -1927,7 +1921,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks + 1; if (!__allow_reserved_blocks(sbi, inode)) - valid_block_count += sbi->root_reserved_blocks; + valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 57afbf3e09ea..e39edd76e170 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -166,9 +166,9 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) cp_reason = CP_FASTBOOT_MODE; - else if (sbi->active_logs == 2) + else if (F2FS_OPTION(sbi).active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; - else if (sbi->fsync_mode == FSYNC_MODE_STRICT && + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && need_dentry_mark(sbi, inode->i_ino) && exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2b00eb44bb90..62aec95fe124 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -97,7 +97,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); if (f2fs_has_inline_xattr(inode)) - xattr_size = sbi->inline_xattr_size; + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; /* Otherwise, will be 0 */ } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -967,7 +967,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); f2fs_i_links_write(old_dir, false); } - if (sbi->fsync_mode == FSYNC_MODE_STRICT) + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) add_ino_entry(sbi, 
new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -1118,7 +1118,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); - if (sbi->fsync_mode == FSYNC_MODE_STRICT) { + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 92a46a7ba931..3389721893d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2251,7 +2251,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; /* find segments from 0 to reuse freed segments */ - if (sbi->alloc_mode == ALLOC_MODE_REUSE) + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; return CURSEG_I(sbi, type)->segno; @@ -2604,7 +2604,7 @@ int rw_hint_to_seg_type(enum rw_hint hint) enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { - if (sbi->whint_mode == WHINT_MODE_USER) { + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { if (type == DATA) { if (temp == WARM) return WRITE_LIFE_NOT_SET; @@ -2615,7 +2615,7 @@ enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, } else { return WRITE_LIFE_NOT_SET; } - } else if (sbi->whint_mode == WHINT_MODE_FS) { + } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { if (type == DATA) { if (temp == WARM) return WRITE_LIFE_LONG; @@ -2684,7 +2684,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) { int type = 0; - switch (fio->sbi->active_logs) { + switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b7c3f3b18a6d..67b0e1e34da0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -60,7 +60,7 @@ char *fault_name[FAULT_MAX] = { static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) { - struct 
f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); @@ -208,21 +208,24 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) block_t limit = (sbi->user_block_count << 1) / 1000; /* limit is 0.2% */ - if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) { - sbi->root_reserved_blocks = limit; + if (test_opt(sbi, RESERVE_ROOT) && + F2FS_OPTION(sbi).root_reserved_blocks > limit) { + F2FS_OPTION(sbi).root_reserved_blocks = limit; f2fs_msg(sbi->sb, KERN_INFO, "Reduce reserved blocks for root = %u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } if (!test_opt(sbi, RESERVE_ROOT) && - (!uid_eq(sbi->s_resuid, + (!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || - !gid_eq(sbi->s_resgid, + !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) f2fs_msg(sbi->sb, KERN_INFO, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); } static void init_once(void *foo) @@ -242,7 +245,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, char *qname; int ret = -EINVAL; - if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -260,8 +263,8 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "Not enough memory for storing quotafile name"); return -EINVAL; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (F2FS_OPTION(sbi).s_qf_names[qtype]) { + if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) ret = 0; else f2fs_msg(sb, 
KERN_ERR, @@ -274,7 +277,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quotafile must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + F2FS_OPTION(sbi).s_qf_names[qtype] = qname; set_opt(sbi, QUOTA); return 0; errout: @@ -286,13 +289,13 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); return -EINVAL; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -308,15 +311,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "Cannot enable project quota enforcement."); return -1; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || - sbi->s_qf_names[PRJQUOTA]) { - if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) clear_opt(sbi, USRQUOTA); - if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sbi, GRPQUOTA) && + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) clear_opt(sbi, GRPQUOTA); - if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + if (test_opt(sbi, PRJQUOTA) && + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) clear_opt(sbi, PRJQUOTA); if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || @@ -326,17 +333,17 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } - if (!sbi->s_jquota_fmt) { + if (!F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " "not specified"); return -1; } } - if (f2fs_sb_has_quota_ino(sbi->sb) 
&& sbi->s_jquota_fmt) { + if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_INFO, "QUOTA feature is enabled, so ignore jquota_fmt"); - sbi->s_jquota_fmt = 0; + F2FS_OPTION(sbi).s_jquota_fmt = 0; } if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, @@ -446,7 +453,7 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; set_opt(sbi, INLINE_XATTR_SIZE); - sbi->inline_xattr_size = arg; + F2FS_OPTION(sbi).inline_xattr_size = arg; break; #else case Opt_user_xattr: @@ -486,7 +493,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) return -EINVAL; - sbi->active_logs = arg; + F2FS_OPTION(sbi).active_logs = arg; break; case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); @@ -530,9 +537,9 @@ static int parse_options(struct super_block *sb, char *options) if (test_opt(sbi, RESERVE_ROOT)) { f2fs_msg(sb, KERN_INFO, "Preserve previous reserve_root=%u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } else { - sbi->root_reserved_blocks = arg; + F2FS_OPTION(sbi).root_reserved_blocks = arg; set_opt(sbi, RESERVE_ROOT); } break; @@ -545,7 +552,7 @@ static int parse_options(struct super_block *sb, char *options) "Invalid uid value %d", arg); return -EINVAL; } - sbi->s_resuid = uid; + F2FS_OPTION(sbi).s_resuid = uid; break; case Opt_resgid: if (args->from && match_int(args, &arg)) @@ -556,7 +563,7 @@ static int parse_options(struct super_block *sb, char *options) "Invalid gid value %d", arg); return -EINVAL; } - sbi->s_resgid = gid; + F2FS_OPTION(sbi).s_resgid = gid; break; case Opt_mode: name = match_strdup(&args[0]); @@ -591,7 +598,7 @@ static int parse_options(struct super_block *sb, char *options) 1 << arg, BIO_MAX_PAGES); return -EINVAL; } - sbi->write_io_size_bits = arg; + F2FS_OPTION(sbi).write_io_size_bits 
= arg; break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) @@ -652,13 +659,13 @@ static int parse_options(struct super_block *sb, char *options) return ret; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; break; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; break; case Opt_jqfmt_vfsv1: - sbi->s_jquota_fmt = QFMT_VFS_V1; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; break; case Opt_noquota: clear_opt(sbi, QUOTA); @@ -691,13 +698,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 10 && !strncmp(name, "user-based", 10)) { - sbi->whint_mode = WHINT_MODE_USER; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { - sbi->whint_mode = WHINT_MODE_OFF; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; } else if (strlen(name) == 8 && !strncmp(name, "fs-based", 8)) { - sbi->whint_mode = WHINT_MODE_FS; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; } else { kfree(name); return -EINVAL; @@ -711,10 +718,10 @@ static int parse_options(struct super_block *sb, char *options) if (strlen(name) == 7 && !strncmp(name, "default", 7)) { - sbi->alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; } else if (strlen(name) == 5 && !strncmp(name, "reuse", 5)) { - sbi->alloc_mode = ALLOC_MODE_REUSE; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { kfree(name); return -EINVAL; @@ -727,10 +734,10 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 5 && !strncmp(name, "posix", 5)) { - sbi->fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; } else if (strlen(name) == 6 && !strncmp(name, "strict", 6)) { - sbi->fsync_mode = FSYNC_MODE_STRICT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; } else { kfree(name); return 
-EINVAL; @@ -770,8 +777,9 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!sbi->inline_xattr_size || - sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + if (!F2FS_OPTION(sbi).inline_xattr_size || + F2FS_OPTION(sbi).inline_xattr_size >= + DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) { @@ -784,8 +792,8 @@ static int parse_options(struct super_block *sb, char *options) /* Not pass down write hints if the number of active logs is lesser * than NR_CURSEG_TYPE. */ - if (sbi->active_logs != NR_CURSEG_TYPE) - sbi->whint_mode = WHINT_MODE_OFF; + if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; return 0; } @@ -1028,7 +1036,7 @@ static void f2fs_put_super(struct super_block *sb) mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) @@ -1142,8 +1150,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - if (buf->f_bfree > sbi->root_reserved_blocks) - buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks; + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) + buf->f_bavail = buf->f_bfree - + F2FS_OPTION(sbi).root_reserved_blocks; else buf->f_bavail = 0; @@ -1178,10 +1187,10 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #ifdef CONFIG_QUOTA struct f2fs_sb_info *sbi = F2FS_SB(sb); - if (sbi->s_jquota_fmt) { + if (F2FS_OPTION(sbi).s_jquota_fmt) { char *fmtname = ""; - switch (sbi->s_jquota_fmt) { + switch (F2FS_OPTION(sbi).s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; @@ -1195,14 +1204,17 @@ static inline void 
f2fs_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); - if (sbi->s_qf_names[PRJQUOTA]) - seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); #endif } @@ -1237,7 +1249,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_xattr"); if (test_opt(sbi, INLINE_XATTR_SIZE)) seq_printf(seq, ",inline_xattr_size=%u", - sbi->inline_xattr_size); + F2FS_OPTION(sbi).inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1273,18 +1285,20 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); - seq_printf(seq, ",active_logs=%u", sbi->active_logs); + seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", - sbi->root_reserved_blocks, - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + F2FS_OPTION(sbi).root_reserved_blocks, + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) seq_printf(seq, ",fault_injection=%u", - 
sbi->fault_info.inject_rate); + F2FS_OPTION(sbi).fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) @@ -1297,19 +1311,19 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (sbi->whint_mode == WHINT_MODE_USER) + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (sbi->whint_mode == WHINT_MODE_FS) + else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); - if (sbi->alloc_mode == ALLOC_MODE_DEFAULT) + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); - else if (sbi->alloc_mode == ALLOC_MODE_REUSE) + else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) seq_printf(seq, ",alloc_mode=%s", "reuse"); - if (sbi->fsync_mode == FSYNC_MODE_POSIX) + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); - else if (sbi->fsync_mode == FSYNC_MODE_STRICT) + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) seq_printf(seq, ",fsync_mode=%s", "strict"); return 0; } @@ -1317,11 +1331,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - sbi->active_logs = NR_CURSEG_TYPE; - sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - sbi->whint_mode = WHINT_MODE_OFF; - sbi->alloc_mode = ALLOC_MODE_DEFAULT; - sbi->fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -1359,24 +1373,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct 
f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; - int err, active_logs; + int err; bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - int old_whint_mode = sbi->whint_mode; - int old_alloc_mode = sbi->alloc_mode; - int old_fsync_mode = sbi->fsync_mode; - int old_inline_xattr_size = sbi->inline_xattr_size; - block_t old_root_reserved_blocks = sbi->root_reserved_blocks; - kuid_t old_resuid = sbi->s_resuid; - kgid_t old_resgid = sbi->s_resgid; - int old_write_io_size_bits = sbi->write_io_size_bits; -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info ffi = sbi->fault_info; -#endif #ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; int i, j; #endif @@ -1386,21 +1387,21 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; - active_logs = sbi->active_logs; #ifdef CONFIG_QUOTA - s_jquota_fmt = sbi->s_jquota_fmt; + org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { - s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); - if (!s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + org_mount_opt.s_qf_names[i] = + kstrdup(F2FS_OPTION(sbi).s_qf_names[i], + GFP_KERNEL); + if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kfree(s_qf_names[j]); + kfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { - s_qf_names[i] = NULL; + org_mount_opt.s_qf_names[i] = NULL; } } #endif @@ -1470,7 +1471,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY || sbi->whint_mode != old_whint_mode) { + if (*flags & MS_RDONLY || + F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1496,7 +1498,7 @@ static int f2fs_remount(struct super_block 
*sb, int *flags, char *data) #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kfree(s_qf_names[i]); + kfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -1514,26 +1516,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } restore_opts: #ifdef CONFIG_QUOTA - sbi->s_jquota_fmt = s_jquota_fmt; + F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = s_qf_names[i]; + kfree(F2FS_OPTION(sbi).s_qf_names[i]); + F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif - sbi->write_io_size_bits = old_write_io_size_bits; - sbi->s_resgid = old_resgid; - sbi->s_resuid = old_resuid; - sbi->root_reserved_blocks = old_root_reserved_blocks; - sbi->inline_xattr_size = old_inline_xattr_size; - sbi->alloc_mode = old_alloc_mode; - sbi->fsync_mode = old_fsync_mode; - sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; - sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; -#ifdef CONFIG_F2FS_FAULT_INJECTION - sbi->fault_info = ffi; -#endif return err; } @@ -1655,8 +1645,8 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) { - return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], - sbi->s_jquota_fmt, type); + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], + F2FS_OPTION(sbi).s_jquota_fmt, type); } int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) @@ -1675,7 +1665,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) } for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { err = f2fs_quota_on_mount(sbi, i); if (!err) { enabled = 1; @@ -2566,7 +2556,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters 
according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - sbi->alloc_mode = ALLOC_MODE_REUSE; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } @@ -2619,8 +2609,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; - sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) @@ -2978,7 +2968,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif kfree(options); free_sb_buf: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 23a2d8d66c43..7d983ad19da4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -58,7 +58,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; + return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif return NULL; } @@ -222,7 +222,7 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - sbi->root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index bb92fd5b5841..61ddee120675 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -39,10 +39,10 @@ #define F2FS_MAX_QUOTAS 3 
-#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ -#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ -#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ -#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */ +#define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */ #define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) /* This flag is used by node and meta inodes, and by recovery */ From 30654507e0a28a634e709f7fa05dd3850067bd32 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 8 Mar 2018 19:34:38 +0900 Subject: [PATCH 0646/1212] f2fs: add nowait aio support This patch adds nowait aio support[1]. Return EAGAIN if any of the following checks fail for direct I/O: - i_rwsem is not lockable - Blocks are not allocated at the write location And xfstests generic/471 is passed. 
[1]: 6be96d "Introduce RWF_NOWAIT and FMODE_AIO_NOWAIT" Signed-off-by: Hyunchul Lee Reviewed-by: Goldwyn Rodrigues Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 47 ++++++++++++++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 35 ++++++++++++++++++++++++++++------ include/linux/fs.h | 4 ++++ 4 files changed, 78 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1e78f55c9a7a..b66b78d3f76d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -837,13 +837,6 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return 0; } -static inline bool __force_buffered_io(struct inode *inode, int rw) -{ - return (f2fs_encrypted_file(inode) || - (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); -} - int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -875,7 +868,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (direct_io) { map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); - flag = __force_buffered_io(inode, WRITE) ? + flag = f2fs_force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; goto map_blocks; @@ -1119,6 +1112,31 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, return err; } +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + struct f2fs_map_blocks map; + block_t last_lblk; + int err; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = F2FS_BYTES_TO_BLK(pos); + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + last_lblk = F2FS_BLK_ALIGN(pos + len); + + while (map.m_lblk < last_lblk) { + map.m_len = last_lblk - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err || map.m_len == 0) + return false; + map.m_lblk += map.m_len; + } + return true; +} + static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, pgoff_t *next_pgofs, int seg_type) @@ -2308,7 +2326,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (err) return err; - if (__force_buffered_io(inode, rw)) + if (f2fs_force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); @@ -2316,7 +2334,15 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - down_read(&F2FS_I(inode)->dio_rwsem[rw]); + if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) { + if (iocb->ki_flags & IOCB_NOWAIT) { + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + down_read(&F2FS_I(inode)->dio_rwsem[rw]); + } + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); up_read(&F2FS_I(inode)->dio_rwsem[rw]); @@ -2332,6 +2358,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } } +out: trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa93ef53be34..615f158f895b 100644 --- a/fs/f2fs/f2fs.h 
+++ b/fs/f2fs/f2fs.h @@ -2961,6 +2961,7 @@ int f2fs_release_page(struct page *page, gfp_t wait); int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); /* * gc.c @@ -3345,4 +3346,11 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } +static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + return (f2fs_encrypted_file(inode) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e39edd76e170..cdad5853aaff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -482,6 +482,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + + filp->f_mode |= FMODE_NOWAIT; + return dquot_file_open(inode, filp); } @@ -2696,7 +2699,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = generic_write_checks(iocb, from); if (ret > 0) { int err; @@ -2704,11 +2715,23 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iov_iter_fault_in_readable(from, iov_iter_count(from))) set_inode_flag(inode, FI_NO_PREALLOC); - err = f2fs_preallocate_blocks(iocb, from); - if (err) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - return err; + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + if (!f2fs_overwrite_io(inode, iocb->ki_pos, + iov_iter_count(from)) || + f2fs_has_inline_data(inode) || + f2fs_force_buffered_io(inode, WRITE)) { + inode_unlock(inode); + return -EAGAIN; + } + + } else { + err = 
f2fs_preallocate_blocks(iocb, from); + if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + return err; + } } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8231cdc25901..e9382296305d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -144,6 +144,9 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +/* File is capable of returning -EAGAIN if I/O will block */ +#define FMODE_NOWAIT ((__force fmode_t)0x8000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are @@ -334,6 +337,7 @@ enum rw_hint { #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) +#define IOCB_NOWAIT (1 << 7) struct kiocb { struct file *ki_filp; From 4c55abe4f8d2ca91987cf5e91e8eb7a71b2dab9c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 9 Mar 2018 14:24:22 +0800 Subject: [PATCH 0647/1212] f2fs: remove unneeded set_cold_node() When setting COLD_BIT_SHIFT flag in node block, we only need to call set_cold_node() in new_node_page() and recover_inode_page() during node page initialization. So remove unneeded set_cold_node() in other places. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 -- fs/f2fs/inode.c | 1 - fs/f2fs/node.c | 3 ++- fs/f2fs/node.h | 4 ++-- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 02c32c96fe09..00ada49c7fa4 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -396,8 +396,6 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; - - set_cold_node(inode, page); } if (new_name) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 10be247ca421..562a56bc037c 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -439,7 +439,6 @@ void update_inode(struct inode *inode, struct page *node_page) } __set_inode_rdev(inode, ri); - set_cold_node(inode, node_page); /* deleted inode */ if (inode->i_nlink == 0) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c294d0feea08..ab2595842c5d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1076,7 +1076,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(dn->inode, page); + set_cold_node(page, S_ISDIR(dn->inode->i_mode)); if (!PageUptodate(page)) SetPageUptodate(page); if (set_page_dirty(page)) @@ -2313,6 +2313,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (!PageUptodate(ipage)) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); + set_cold_node(page, false); src = F2FS_INODE(page); dst = F2FS_INODE(ipage); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 081ef0d672bf..e593b4d78be2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -422,12 +422,12 @@ static inline void clear_inline_node(struct page *page) ClearPageChecked(page); } -static inline void set_cold_node(struct inode *inode, struct page *page) +static inline void set_cold_node(struct page *page, bool is_dir) { struct 
f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); - if (S_ISDIR(inode->i_mode)) + if (is_dir) flag &= ~(0x1 << COLD_BIT_SHIFT); else flag |= (0x1 << COLD_BIT_SHIFT); From 739ace131cdfd5dd0eca4c4bbf06b0a3bce25d9d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 9 Mar 2018 17:42:28 -0800 Subject: [PATCH 0648/1212] f2fs: align memory boundary for bitops For example, in arm64, free_nid_bitmap should be aligned to word size in order to use bit operations. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 20 +++++++++++++++++--- include/linux/f2fs_fs.h | 4 ++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 615f158f895b..93b13c50af67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -830,7 +830,7 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ - unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char **free_nid_bitmap; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ab2595842c5d..571cb70c5fbd 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2730,12 +2730,20 @@ static int init_node_manager(struct f2fs_sb_info *sbi) static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int i; - nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks * - NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks * + sizeof(unsigned char *), GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; + for (i = 0; i < nm_i->nat_blocks; i++) { + nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, + NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + } + 
nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) @@ -2826,7 +2834,13 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kvfree(nm_i->nat_block_bitmap); - kvfree(nm_i->free_nid_bitmap); + if (nm_i->free_nid_bitmap) { + int i; + + for (i = 0; i < nm_i->nat_blocks; i++) + kvfree(nm_i->free_nid_bitmap[i]); + kfree(nm_i->free_nid_bitmap); + } kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 61ddee120675..2ebfa01b7091 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -305,6 +305,10 @@ struct f2fs_node { */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) #define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) +#define NAT_ENTRY_BITMAP_SIZE_ALIGNED \ + ((NAT_ENTRY_BITMAP_SIZE + BITS_PER_LONG - 1) / \ + BITS_PER_LONG * BITS_PER_LONG) + struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ From 8a5719615847c523b9975bb68294aa9a792d94ba Mon Sep 17 00:00:00 2001 From: Qiuyang Sun Date: Tue, 13 Mar 2018 19:42:50 +0800 Subject: [PATCH 0649/1212] f2fs: release locks before return in f2fs_ioc_gc_range() Currently, we will leave the kernel with locks still held when the gc_range is invalid. This patch fixes the bug. 
Signed-off-by: Qiuyang Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cdad5853aaff..abc3db46cb1d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2057,8 +2057,10 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) - return -EINVAL; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + ret = -EINVAL; + goto out; + } do_more: if (!range.sync) { if (!mutex_trylock(&sbi->gc_mutex)) { From 9321e22c038cf725ad1734b42dffb2536e920242 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 15 Mar 2018 18:51:41 +0800 Subject: [PATCH 0650/1212] f2fs: introduce F2FS_FEATURE_LOST_FOUND feature This patch introduces a new feature, F2FS_FEATURE_LOST_FOUND, which is set by mkfs. mkfs creates a directory named lost+found, which saves unreachable files. If fsck finds a file which has no parent, or its parent is removed by fsck, the file will be placed under lost+found directory by fsck. lost+found directory could not be encrypted. As a result, the root directory cannot be encrypted too. So if LOST_FOUND feature is enabled, let's avoid to encrypt root directory. 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 12 ++++++++++++ fs/f2fs/sysfs.c | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 93b13c50af67..e0bf6f83dd14 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -145,6 +145,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 +#define F2FS_FEATURE_LOST_FOUND 0x0200 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -3298,6 +3299,7 @@ F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); +F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 67b0e1e34da0..ec4774942a48 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1895,6 +1895,18 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * Encrypting the root directory is not allowed because fsck + * expects lost+found directory to exist and remain unencrypted + * if LOST_FOUND feature is enabled. 
+ * + */ + if (f2fs_sb_has_lost_found(sbi->sb) && + inode->i_ino == F2FS_ROOT_INO(sbi)) + return -EPERM; + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, fs_data, XATTR_CREATE); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 7d983ad19da4..f33a56d6e6dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -116,6 +116,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_crtime(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); + if (f2fs_sb_has_lost_found(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "lost_found"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -292,6 +295,7 @@ enum feat_id { FEAT_FLEXIBLE_INLINE_XATTR, FEAT_QUOTA_INO, FEAT_INODE_CRTIME, + FEAT_LOST_FOUND, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -307,6 +311,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_FLEXIBLE_INLINE_XATTR: case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: + case FEAT_LOST_FOUND: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -386,6 +391,7 @@ F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); +F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -441,6 +447,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(flexible_inline_xattr), ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), + ATTR_LIST(lost_found), NULL, }; From 7419dcb8be0282e165f676539babeec2766bb0ca Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 15 Mar 2018 18:51:42 +0800 Subject: [PATCH 0651/1212] f2fs: introduce a new mount option test_dummy_encryption This patch introduces a new mount option 
`test_dummy_encryption' to allow fscrypt to create a fake fscrypt context. This is used by xfstests. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ fs/f2fs/dir.c | 4 +++- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/namei.c | 9 ++++++--- fs/f2fs/super.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index fb92e6f25adf..1f52baea2f69 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -187,6 +187,8 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix" fsync will be heavy and behaves in line with xfs, ext4 and btrfs, where xfstest generic/342 will pass, but the performance will regress. +test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt + context. The fake fscrypt context is used by xfstests. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 00ada49c7fa4..41d32171bd52 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -361,6 +361,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, struct page *dpage) { struct page *page; + int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -387,7 +388,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { + if ((f2fs_encrypted_inode(dir) || dummy_encrypt) && + f2fs_may_encrypt(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e0bf6f83dd14..423603d3f5b3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -134,6 +134,7 @@ struct f2fs_mount_info { int whint_mode; int alloc_mode; /* 
segment allocation policy */ int fsync_mode; /* fsync policy */ + bool test_dummy_encryption; /* test dummy encryption */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -1141,6 +1142,13 @@ enum fsync_mode { FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) \ + (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 62aec95fe124..5ec20f077629 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -78,7 +78,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. */ - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) + if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi->sb)) { @@ -784,10 +785,12 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); if (err) return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ec4774942a48..62f228478849 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -132,6 +132,7 @@ enum { Opt_whint, Opt_alloc, Opt_fsync, + Opt_test_dummy_encryption, Opt_err, }; @@ -188,6 +189,7 @@ static match_table_t f2fs_tokens = { {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + 
{Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_err, NULL}, }; @@ -744,6 +746,21 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (!f2fs_sb_has_encrypt(sb)) { + f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); + return -EINVAL; + } + + F2FS_OPTION(sbi).test_dummy_encryption = true; + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mode enabled"); +#else + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mount option ignored"); +#endif + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -1315,6 +1332,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_OPTION(sbi).test_dummy_encryption) + seq_puts(seq, ",test_dummy_encryption"); +#endif if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1336,6 +1357,7 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).test_dummy_encryption = false; sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -1912,6 +1934,11 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } +static bool f2fs_dummy_context(struct inode *inode) +{ + return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); +} + static unsigned f2fs_max_namelen(struct inode *inode) { return S_ISLNK(inode->i_mode) ? 
@@ -1922,6 +1949,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, + .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; From 6ab573a9d96f7991927948ecb481c89654d4bdd0 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Fri, 16 Mar 2018 18:53:53 +0530 Subject: [PATCH 0652/1212] f2fs: Set GFP_NOFS in read_cache_page_gfp while doing f2fs_quota_read Quota code itself is serializing the operations by taking mutex_lock. It seems the deadlock below can happen if GFP_NOFS is not used in f2fs_quota_read: __switch_to+0x88 __schedule+0x5b0 schedule+0x78 schedule_preempt_disabled+0x20 __mutex_lock_slowpath+0xdc //mutex owner is itself mutex_lock+0x2c dquot_commit+0x30 //mutex_lock(&dqopt->dqio_mutex); dqput+0xe0 __dquot_drop+0x80 dquot_drop+0x48 f2fs_evict_inode+0x218 evict+0xa8 dispose_list+0x3c prune_icache_sb+0x58 super_cache_scan+0xf4 do_shrink_slab+0x208 shrink_slab.part.40+0xac shrink_zone+0x1b0 do_try_to_free_pages+0x25c try_to_free_pages+0x164 __alloc_pages_nodemask+0x534 do_read_cache_page+0x6c read_cache_page+0x14 f2fs_quota_read+0xa4 read_blk+0x54 find_tree_dqentry+0xe4 find_tree_dqentry+0xb8 find_tree_dqentry+0xb8 find_tree_dqentry+0xb8 qtree_read_dquot+0x68 v2_read_dquot+0x24 dquot_acquire+0x5c // mutex_lock(&dqopt->dqio_mutex); dqget+0x238 __dquot_initialize+0xd4 dquot_initialize+0x10 dquot_file_open+0x34 f2fs_file_open+0x6c do_dentry_open+0x1e4 vfs_open+0x6c path_openat+0xa20 do_filp_open+0x4c do_sys_open+0x178 Signed-off-by: Ritesh Harjani Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 62f228478849..2feaf1e3fc9f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1573,7 +1573,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, while (toread > 0) { tocopy =
min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: - page = read_mapping_page(mapping, blkidx, NULL); + page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); From 2c8834a7a2c95b19e7242559ac4fc64c0f40916d Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 8 Mar 2018 16:29:13 +0800 Subject: [PATCH 0653/1212] f2fs: check blkaddr more accurately before issuing a bio This patch checks blkaddr more accurately before issuing a write or read bio. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/data.c | 5 +++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.h | 25 +++++++++++++++++++------ 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9db919c423b6..04c608646fd5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -68,6 +68,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, + .is_meta = is_meta, }; if (unlikely(!is_meta)) @@ -163,6 +164,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, REQ_RAHEAD, .encrypted_page = NULL, .in_list = false, + .is_meta = (type != META_POR), }; struct blk_plug plug; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b66b78d3f76d..b677300c5bac 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -381,6 +381,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) struct page *page = fio->encrypted_page ?
fio->encrypted_page : fio->page; + verify_block_addr(fio, fio->new_blkaddr); trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); @@ -426,8 +427,8 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) } if (fio->old_blkaddr != NEW_ADDR) - verify_block_addr(sbi, fio->old_blkaddr); - verify_block_addr(sbi, fio->new_blkaddr); + verify_block_addr(fio, fio->old_blkaddr); + verify_block_addr(fio, fio->new_blkaddr); bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 423603d3f5b3..db5f61f821e4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1065,6 +1065,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + bool is_meta; /* indicate borrow meta inode mapping or not */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ }; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d1524d16b2a0..96a2d57ba8a4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -53,13 +53,19 @@ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ (sbi)->segs_per_sec)) \ -#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) -#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) +#define MAIN_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) +#define SEG0_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) #define MAIN_SECS(sbi) ((sbi)->total_sections) -#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) +#define TOTAL_SEGS(sbi) \ + (SM_I(sbi) ? 
SM_I(sbi)->segment_count : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) #define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) @@ -632,10 +638,17 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { - BUG_ON(blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + struct f2fs_sb_info *sbi = fio->sbi; + + if (PAGE_TYPE_OF_BIO(fio->type) == META && + (!is_read_io(fio->op) || fio->is_meta)) + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) || + blk_addr >= MAIN_BLKADDR(sbi)); + else + BUG_ON(blk_addr < MAIN_BLKADDR(sbi) || + blk_addr >= MAX_BLKADDR(sbi)); } /* From d6a69d5e656825919c0b92f50032829f55d6f8f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Mar 2018 23:08:29 +0800 Subject: [PATCH 0654/1212] f2fs: clean up with F2FS_BLK_ALIGN Clean up F2FS_BYTES_TO_BLK(x + F2FS_BLKSIZE - 1) with F2FS_BLK_ALIGN(x). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +-- fs/f2fs/node.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index abc3db46cb1d..161ddca86387 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -575,7 +575,6 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, int truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; int count = 0, err = 0; @@ -584,7 +583,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); - free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); if (free_from >= sbi->max_file_blocks) goto free_partial; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 571cb70c5fbd..35e661890c58 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2603,8 +2603,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) if (!enabled_nat_bits(sbi, NULL)) return 0; - nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + - F2FS_BLKSIZE - 1); + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) From 49338842e9b23b7a320531b7f199e0e5266e2de4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Mar 2018 23:08:30 +0800 Subject: [PATCH 0655/1212] f2fs: don't track new nat entry in nat set Nat entry set is used only in checkpoint(), and during checkpoint() we won't flush new nat entry with unallocated address, so we don't need to add new nat entry into nat set, then nat_entry_set::entry_cnt can indicate actual entry count we need to flush in checkpoint(). 
Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 29 +++++++++++++++++++++++++---- fs/f2fs/node.h | 1 + 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 35e661890c58..157d768c7b31 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -193,8 +193,8 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) __free_nat_entry(e); } -static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) +static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) { nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; @@ -209,15 +209,36 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } + return head; +} + +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + struct nat_entry_set *head; + bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; + + if (!new_ne) + head = __grab_nat_entry_set(nm_i, ne); + + /* + * update entry_cnt in below condition: + * 1. update NEW_ADDR to valid block address; + * 2. update old block address to new one; + */ + if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || + !get_nat_flag(ne, IS_DIRTY))) + head->entry_cnt++; + + set_nat_flag(ne, IS_PREALLOC, new_ne); if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; nm_i->dirty_nat_cnt++; - head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); refresh_list: - if (nat_get_blkaddr(ne) == NEW_ADDR) + if (new_ne) list_del_init(&ne->list); else list_move_tail(&ne->list, &head->entry_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e593b4d78be2..b95e49e4a928 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -44,6 +44,7 @@ enum { HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */ IS_DIRTY, /* this nat entry is dirty? 
*/ + IS_PREALLOC, /* nat entry is preallocated */ }; /* From 0192e0a4502f23761a844333fb878fc60ce1b029 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 22 Mar 2018 10:08:40 +0800 Subject: [PATCH 0656/1212] f2fs: no need to initialize zero value for GFP_F2FS_ZERO Since f2fs_inode_info is allocated with flag GFP_F2FS_ZERO, we do not need to initialize its members to zero any more. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2feaf1e3fc9f..a622eb4f59f2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -827,7 +827,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; - fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); @@ -839,10 +838,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); -#ifdef CONFIG_QUOTA - memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); - fi->i_reserved_quota = 0; -#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; From ee2e74b3f00e663207d7832f613e75a5df3ae3fb Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 26 Mar 2018 17:32:23 +0800 Subject: [PATCH 0657/1212] f2fs: Add a segment type check in inplace write This patch adds a segment type check in IPU, in case something is wrong with the blkaddr in the dnode.
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3389721893d3..d7bac60ad719 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2871,10 +2871,15 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) int rewrite_data_page(struct f2fs_io_info *fio) { int err; + struct f2fs_sb_info *sbi = fio->sbi; fio->new_blkaddr = fio->old_blkaddr; /* i/o temperature is needed for passing down write hints */ __get_segment_type(fio); + + f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi, + GET_SEGNO(sbi, fio->new_blkaddr))->type)); + stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); From db2188a68704bd120d32836bc5ac273dc26b4617 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 28 Mar 2018 11:15:09 -0700 Subject: [PATCH 0658/1212] f2fs: reserve bits for fs-verity Reserve an F2FS feature flag and inode flag for fs-verity. This is an in-development feature that is planned to be discussed at LSF/MM 2018 [1]. It will provide file-based integrity and authenticity for read-only files. Most code will be in a filesystem-independent module, with smaller changes needed to individual filesystems that opt in to supporting the feature. An early prototype supporting F2FS is available [2]. Reserving the F2FS on-disk bits for fs-verity will prevent users of the prototype from conflicting with other new F2FS features. Note that we're reserving the inode flag in f2fs_inode.i_advise, which isn't really appropriate since it's not a hint or advice. But ->i_advise is already being used to hold the 'encrypt' flag; and F2FS's ->i_flags uses the generic FS_* values, so it seems ->i_flags can't be used for an F2FS-specific flag without additional work to remove the assumption that ->i_flags uses the generic flags namespace.
[1] https://marc.info/?l=linux-fsdevel&m=151690752225644 [2] https://git.kernel.org/pub/scm/linux/kernel/git/mhalcrow/linux.git/log/?h=fs-verity-dev Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index db5f61f821e4..2ede2e36f30f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -147,6 +147,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_FEATURE_LOST_FOUND 0x0200 +#define F2FS_FEATURE_VERITY 0x0400 /* reserved */ #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -662,6 +663,7 @@ enum { #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 +#define FADVISE_VERITY_BIT 0x40 /* reserved */ #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) From 86444d6006929a57706be791a09e1e77f9fd6fdd Mon Sep 17 00:00:00 2001 From: Junling Zheng Date: Thu, 29 Mar 2018 19:27:12 +0800 Subject: [PATCH 0659/1212] f2fs: fix a wrong condition in f2fs_skip_inode_update Fix commit 97dd26ad8347 (f2fs: fix wrong AUTO_RECOVER condition). We should use ~PAGE_MASK to determine whether i_size is aligned to the f2fs's block size or not. 
Signed-off-by: Junling Zheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2ede2e36f30f..1ca32899af2b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2535,7 +2535,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || file_keep_isize(inode) || - i_size_read(inode) & PAGE_MASK) + i_size_read(inode) & ~PAGE_MASK) return false; down_read(&F2FS_I(inode)->i_sem); From a8d07f1f9c627e53ae608cb3273ade02ae56e343 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 30 Mar 2018 17:58:13 -0700 Subject: [PATCH 0660/1212] f2fs: truncate preallocated blocks in error case If write is failed, we must deallocate the blocks that we couldn't write. Cc: stable@vger.kernel.org Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 161ddca86387..39c3acb454a3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2711,6 +2711,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = generic_write_checks(iocb, from); if (ret > 0) { + bool preallocated = false; + size_t target_size = 0; int err; if (iov_iter_fault_in_readable(from, iov_iter_count(from))) @@ -2727,6 +2729,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } } else { + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + err = f2fs_preallocate_blocks(iocb, from); if (err) { clear_inode_flag(inode, FI_NO_PREALLOC); @@ -2739,6 +2744,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + /* if we couldn't write data, we should deallocate blocks. 
*/ + if (preallocated && i_size_read(inode) < target_size) + f2fs_truncate(inode); + if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } From 6cb5aa02bfbd5081549988aa76cd3598eb1acdab Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 2 Apr 2018 20:22:20 +0800 Subject: [PATCH 0661/1212] f2fs: make assignment of t->dentry_bitmap more readable In make_dentry_ptr_block, it is confused with "&" for t->dentry_bitmap but without "&" for t->dentry, so delete "&" to make code more readable. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ca32899af2b..3bb4071633f2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -536,7 +536,7 @@ static inline void make_dentry_ptr_block(struct inode *inode, d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; - d->bitmap = &t->dentry_bitmap; + d->bitmap = t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } From 42bf67fc543bc5945b355ad38f40cd84c3886786 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Mar 2018 22:50:41 -0700 Subject: [PATCH 0662/1212] f2fs: remain written times to update inode during fsync This fixes xfstests/generic/392. The failure was caused by different times between 1) one marked in the last fsync(2) call and 2) the other given by roll-forward recovery after power-cut. 
The reason was that we skipped updating inode block at 1), since its i_size was recoverable along with 4KB-aligned data writes, which was fixed by: "f2fs: fix a wrong condition in f2fs_skip_inode_update" Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++++++++++ fs/f2fs/inode.c | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3bb4071633f2..a8bdcf5fc50f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -728,6 +728,7 @@ struct f2fs_inode_info { kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec i_crtime; /* inode creation time */ + struct timespec i_disk_time[4]; /* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2538,6 +2539,16 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) + return false; + down_read(&F2FS_I(inode)->i_sem); ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); up_read(&F2FS_I(inode)->i_sem); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 562a56bc037c..51846fc54fbd 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -284,6 +284,10 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -444,6 +448,10 @@ void update_inode(struct inode 
*inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; } void update_inode_page(struct inode *inode) From 13890bed2032a3d92ea25df2ffe42b54d329f60d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Apr 2018 11:09:47 -0700 Subject: [PATCH 0663/1212] fscrypt: allow synchronous bio decryption Currently, fscrypt provides fscrypt_decrypt_bio_pages() which decrypts a bio's pages asynchronously, then unlocks them afterwards. But, this assumes that decryption is the last "postprocessing step" for the bio, so it's incompatible with additional postprocessing steps such as authenticity verification after decryption. Therefore, rename the existing fscrypt_decrypt_bio_pages() to fscrypt_enqueue_decrypt_bio(). Then, add fscrypt_decrypt_bio() which decrypts the pages in the bio synchronously without unlocking the pages, nor setting them Uptodate; and add fscrypt_enqueue_decrypt_work(), which enqueues work on the fscrypt_read_workqueue. The new functions will be used by filesystems that support both fscrypt and fs-verity. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/crypto/bio.c | 35 +++++++++++++++++++++------------ fs/crypto/crypto.c | 8 +++++++- fs/crypto/fscrypt_private.h | 1 - fs/f2fs/data.c | 2 +- include/linux/fscrypt_notsupp.h | 13 +++++++++--- include/linux/fscrypt_supp.h | 5 ++++- 6 files changed, 44 insertions(+), 20 deletions(-) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a91ed46fe503..c7cf565c434e 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -25,15 +25,8 @@ #include #include "fscrypt_private.h" -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. 
- */ -static void completion_pages(struct work_struct *work) +static void __fscrypt_decrypt_bio(struct bio *bio, bool done) { - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; struct bio_vec *bv; int i; @@ -45,22 +38,38 @@ static void completion_pages(struct work_struct *work) if (ret) { WARN_ON_ONCE(1); SetPageError(page); - } else { + } else if (done) { SetPageUptodate(page); } - unlock_page(page); + if (done) + unlock_page(page); } +} + +void fscrypt_decrypt_bio(struct bio *bio) +{ + __fscrypt_decrypt_bio(bio, false); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio); + +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + + __fscrypt_decrypt_bio(bio, true); fscrypt_release_ctx(ctx); bio_put(bio); } -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio) { INIT_WORK(&ctx->r.work, completion_pages); ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); + fscrypt_enqueue_decrypt_work(&ctx->r.work); } -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); +EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio); void fscrypt_pullback_bio_page(struct page **page, bool restore) { diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index ce654526c0fb..0758d32ad01b 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -45,12 +45,18 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -struct workqueue_struct *fscrypt_read_workqueue; +static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; struct kmem_cache *fscrypt_info_cachep; +void fscrypt_enqueue_decrypt_work(struct work_struct *work) +{ + queue_work(fscrypt_read_workqueue, work); +} 
+EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work); + /** * fscrypt_release_ctx() - Releases an encryption context * @ctx: The encryption context to release. diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 5c296d4af4a9..426aa1b27f17 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -107,7 +107,6 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode, /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); -extern struct workqueue_struct *fscrypt_read_workqueue; extern int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b677300c5bac..7acc982f632d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -65,7 +65,7 @@ static void f2fs_read_end_io(struct bio *bio) if (bio->bi_error) { fscrypt_release_ctx(bio->bi_private); } else { - fscrypt_decrypt_bio_pages(bio->bi_private, bio); + fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); return; } } diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 5777251400f9..44bd4fbd3ec5 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -24,6 +24,10 @@ static inline bool fscrypt_dummy_context_enabled(struct inode *inode) } /* crypto.c */ +static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) +{ +} + static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { @@ -160,10 +164,13 @@ static inline bool fscrypt_match_name(const struct fscrypt_name *fname, } /* bio.c */ -static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, - struct bio *bio) +static inline void fscrypt_decrypt_bio(struct bio *bio) +{ +} + +static inline void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, + struct bio *bio) { - return; } static inline void fscrypt_pullback_bio_page(struct page **page, 
bool restore) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index c88d2058902a..9d1857302b73 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -58,6 +58,7 @@ static inline bool fscrypt_dummy_context_enabled(struct inode *inode) } /* crypto.c */ +extern void fscrypt_enqueue_decrypt_work(struct work_struct *); extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, @@ -187,7 +188,9 @@ static inline bool fscrypt_match_name(const struct fscrypt_name *fname, } /* bio.c */ -extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_decrypt_bio(struct bio *); +extern void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, + struct bio *bio); extern void fscrypt_pullback_bio_page(struct page **, bool); extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, unsigned int); From c18b4f60c8dfa090117422018e5f052e2b5b5ba8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Apr 2018 11:09:48 -0700 Subject: [PATCH 0664/1212] f2fs: refactor read path to allow multiple postprocessing steps Currently f2fs's ->readpage() and ->readpages() assume that either the data undergoes no postprocessing, or decryption only. But with fs-verity, there will be an additional authenticity verification step, and it may be needed either by itself, or combined with decryption. To support this, store a 'struct bio_post_read_ctx' in ->bi_private which contains a work struct, a bitmask of postprocessing steps that are enabled, and an indicator of the current step. The bio completion routine, if there was no I/O error, enqueues the first postprocessing step. When that completes, it continues to the next step. Pages that fail any postprocessing step have PageError set. 
Once all steps have completed, pages without PageError set are set Uptodate, and all pages are unlocked. Also replace f2fs_encrypted_file() with a new function f2fs_post_read_required() in places like direct I/O and garbage collection that really should be testing whether the file needs special I/O processing, not whether it is encrypted specifically. This may also be useful for other future f2fs features such as compression. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 168 +++++++++++++++++++++++++++++++++++------------ fs/f2fs/f2fs.h | 12 +++- fs/f2fs/file.c | 4 +- fs/f2fs/gc.c | 6 +- fs/f2fs/inline.c | 2 +- fs/f2fs/super.c | 6 ++ 6 files changed, 148 insertions(+), 50 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7acc982f632d..d3d2e4775003 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,11 @@ #include "trace.h" #include +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -49,11 +54,77 @@ static bool __is_cp_guaranteed(struct page *page) return false; } -static void f2fs_read_end_io(struct bio *bio) +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) { - struct bio_vec *bvec; + struct page *page; + struct bio_vec *bv; int i; + bio_for_each_segment_all(bv, bio, i) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed */ + if (bio->bi_error || PageError(page)) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + 
bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool f2fs_bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_error; +} + +static void f2fs_read_end_io(struct bio *bio) +{ #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); @@ -61,28 +132,15 @@ static void f2fs_read_end_io(struct bio *bio) } #endif - if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { - fscrypt_release_ctx(bio->bi_private); - } else { - fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); - return; - } + if (f2fs_bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; + + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; } - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (!bio->bi_error) { - if (!PageUptodate(page)) - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } - bio_put(bio); + __read_end_io(bio); } static void f2fs_write_end_io(struct bio *bio) @@ -479,29 +537,33 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; struct bio *bio; + struct bio_post_read_ctx *ctx; + unsigned int post_read_steps 
= 0; - if (f2fs_encrypted_file(inode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); + if (!bio) + return ERR_PTR(-ENOMEM); + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + if (f2fs_encrypted_file(inode)) + post_read_steps |= 1 << STEP_DECRYPT; + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) { + bio_put(bio); + return ERR_PTR(-ENOMEM); + } + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; /* wait the page to be moved by cleaning */ f2fs_wait_on_block_writeback(sbi, blkaddr); } - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - return bio; } @@ -1522,7 +1584,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) if (!f2fs_encrypted_file(inode)) return 0; - /* wait for GCed encrypted page writeback */ + /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: @@ -2224,8 +2286,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_file(inode)) + /* wait for GCed page writeback via META_MAPPING */ + if (f2fs_post_read_required(inode)) f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) @@ -2556,3 +2618,27 @@ const struct address_space_operations f2fs_dblock_aops = { .migratepage = f2fs_migrate_page, #endif }; + +int __init f2fs_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = 
KMEM_CACHE(bio_post_read_ctx, 0); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void __exit f2fs_destroy_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8bdcf5fc50f..5ca193f25874 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2942,6 +2942,8 @@ void destroy_checkpoint_caches(void); /* * data.c */ +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, @@ -3302,9 +3304,13 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) #endif } -static inline bool f2fs_bio_encrypted(struct bio *bio) +/* + * Returns true if the reads of the inode's data need to undergo some + * postprocessing step, like decryption or authenticity verification. 
+ */ +static inline bool f2fs_post_read_required(struct inode *inode) { - return bio->bi_private != NULL; + return f2fs_encrypted_file(inode); } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3372,7 +3378,7 @@ static inline bool f2fs_may_encrypt(struct inode *inode) static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) { - return (f2fs_encrypted_file(inode) || + return (f2fs_post_read_required(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 39c3acb454a3..7587758a285f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -113,8 +113,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_file(inode)) + /* wait for GCed page writeback via META_MAPPING */ + if (f2fs_post_read_required(inode)) f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 54f51a990794..c009b50d69f5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -850,8 +850,8 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(inode) || is_bad_inode(inode)) continue; - /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_file(inode)) { + /* if inode uses special I/O path, let's go phase 3 */ + if (f2fs_post_read_required(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -899,7 +899,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_file(inode)) + if (f2fs_post_read_required(inode)) move_data_block(inode, start_bidx, segno, off); else move_data_page(inode, start_bidx, gc_type, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 12f6c6471c56..67523fabb822 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -25,7 +25,7 @@ bool 
f2fs_may_inline_data(struct inode *inode) if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_file(inode)) + if (f2fs_post_read_required(inode)) return false; return true; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a622eb4f59f2..55b2bad55671 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3100,8 +3100,13 @@ static int __init init_f2fs_fs(void) err = f2fs_create_root_stats(); if (err) goto free_filesystem; + err = f2fs_init_post_read_processing(); + if (err) + goto free_root_stats; return 0; +free_root_stats: + f2fs_destroy_root_stats(); free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: @@ -3124,6 +3129,7 @@ static int __init init_f2fs_fs(void) static void __exit exit_f2fs_fs(void) { + f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); From dafecc032ea1cfbc1399ec5371d10961aaf59a59 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Apr 2018 15:48:42 -0700 Subject: [PATCH 0665/1212] f2fs: call unlock_new_inode() before d_instantiate() xfstest generic/429 sometimes hangs on f2fs, caused by a thread being unable to take a directory's i_rwsem for write in vfs_rmdir(). In the test, one thread repeatedly creates and removes a directory, and other threads repeatedly look up a file in the directory. The bug is that f2fs_mkdir() calls d_instantiate() before unlock_new_inode(), resulting in the directory inode being exposed to lookups before it has been fully initialized. And with CONFIG_DEBUG_LOCK_ALLOC, unlock_new_inode() reinitializes ->i_rwsem, corrupting its state when it is already held. Fix it by calling unlock_new_inode() before d_instantiate(). This matches what other filesystems do. 
Fixes: 57397d86c62d ("f2fs: add inode operations for special inodes") Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5ec20f077629..fecae8685d2a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -294,8 +294,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -594,8 +594,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, err = page_symlink(inode, disk_link.name, disk_link.len); err_out: - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); /* * Let's flush symlink data in order to avoid broken symlink as much as @@ -658,8 +658,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -710,8 +710,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); From 070da80085a4503d7e7c662f15cd4793d622a626 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Apr 2018 23:09:04 -0700 Subject: [PATCH 0666/1212] f2fs: clear PageError on writepage This patch clears PageError in some pages tagged by read path, but when we write the pages with valid contents, writepage should clear the bit likewise ext4. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d3d2e4775003..b8c142de7bb4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1734,6 +1734,7 @@ int do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; set_page_writeback(page); + ClearPageError(page); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); @@ -1756,6 +1757,7 @@ int do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; set_page_writeback(page); + ClearPageError(page); /* LFS mode write path */ write_data_page(&dn, fio); From 3e7a141175756d3df614ee9ab7480c2e04642b1d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 20 Apr 2018 19:29:52 -0700 Subject: [PATCH 0667/1212] Revert "f2fs: introduce f2fs_set_page_dirty_nobuffer" This patch reverts copied f2fs_set_page_dirty_nobuffer to use generic function for stability. This reverts commit fe76b796fc5194cc3d57265002e3a748566d073f. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +---------------------------------- fs/f2fs/f2fs.h | 1 - fs/f2fs/node.c | 2 +- 4 files changed, 3 insertions(+), 37 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 04c608646fd5..760d1ad22722 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -386,7 +386,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b8c142de7bb4..b48c578c0bf6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,8 +19,6 @@ #include #include #include -#include -#include #include #include "f2fs.h" @@ -2473,37 +2471,6 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 1; } -/* - * This was copied from __set_page_dirty_buffers which gives higher performance - * in very high speed storages. 
(e.g., pmem) - */ -void f2fs_set_page_dirty_nobuffers(struct page *page) -{ - struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; - unsigned long flags; - - if (unlikely(!mapping)) - return; - - spin_lock(&mapping->private_lock); - memcg = mem_cgroup_begin_page_stat(page); - SetPageDirty(page); - spin_unlock(&mapping->private_lock); - - spin_lock_irqsave(&mapping->tree_lock, flags); - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - - mem_cgroup_end_page_stat(memcg); - - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return; -} - static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -2527,7 +2494,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5ca193f25874..486107b8b38c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2975,7 +2975,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); -void f2fs_set_page_dirty_nobuffers(struct page *page); int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 157d768c7b31..3871e7d3f69e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1775,7 +1775,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); 
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); From f819874f58cf77184907d41e7358d970f32bc061 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 20 Apr 2018 23:44:59 -0700 Subject: [PATCH 0668/1212] f2fs: check cap_resource only for data blocks This patch changes the rule to check cap_resource for data blocks, not inode or node blocks in order to avoid selinux denial. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 486107b8b38c..978c58d329f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1678,7 +1678,7 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) } static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool cap) { if (!inode) return true; @@ -1691,7 +1691,7 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; - if (capable(CAP_SYS_RESOURCE)) + if (cap && capable(CAP_SYS_RESOURCE)) return true; return false; } @@ -1726,7 +1726,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode)) + if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { @@ -1933,7 +1933,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, valid_block_count = sbi->total_valid_block_count + sbi->current_reserved_blocks + 1; - if (!__allow_reserved_blocks(sbi, inode)) + if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { 
From a44b418c31458b213ab59659776f5597e7e78b32 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Apr 2018 23:09:04 -0700 Subject: [PATCH 0669/1212] f2fs: clear PageError on writepage - part 2 This patch clears PageError in some pages tagged by read path, but when we write the pages with valid contents, writepage should clear the bit likewise ext4. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + fs/f2fs/inline.c | 1 + fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 1 + 4 files changed, 4 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c009b50d69f5..d28d31cbd7d2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -693,6 +693,7 @@ static void move_data_block(struct inode *inode, block_t bidx, dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); + ClearPageError(page); /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 67523fabb822..2ff0305391cd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -139,6 +139,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); + ClearPageError(page); fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3871e7d3f69e..16aee2a7b8a9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1398,6 +1398,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.op_flags |= WRITE_FLUSH_FUA; set_page_writeback(page); + ClearPageError(page); fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d7bac60ad719..01bc94df9f00 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2838,6 +2838,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, 
struct page *page, fio.op_flags &= ~REQ_META; set_page_writeback(page); + ClearPageError(page); f2fs_submit_page_write(&fio); f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); From 520a9486182437847212c8e226d042b1e14b7cc2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 May 2018 23:26:02 -0700 Subject: [PATCH 0670/1212] f2fs: avoid fsync() failure caused by EAGAIN in writepage() pageout() in MM translates EAGAIN, so calls handle_write_error() -> mapping_set_error() -> set_bit(AS_EIO, ...). file_write_and_wait_range() will see EIO error, which is critical to return value of fsync() followed by atomic_write failure to user. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b48c578c0bf6..b675d5dd5c91 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1901,7 +1901,13 @@ static int __write_data_page(struct page *page, bool *submitted, redirty_out: redirty_page_for_writepage(wbc, page); - if (!err) + /* + * pageout() in MM traslates EAGAIN, so calls handle_write_error() + * -> mapping_set_error() -> set_bit(AS_EIO, ...). + * file_write_and_wait_range() will see EIO error, which is critical + * to return value of fsync() followed by atomic_write failure to user. + */ + if (!err || wbc->for_reclaim) return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; From bb53d06b5f21161295e6dea0eda941351cc9d3a1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 May 2018 13:57:26 -0700 Subject: [PATCH 0671/1212] f2fs: let fstrim issue discard commands in lower priority The fstrim gathers huge number of large discard commands, and tries to issue without IO awareness, which results in long user-perceived IO latencies on READ, WRITE, and FLUSH in UFS. We've observed some of commands take several seconds due to long discard latency. This patch limits the maximum size to 2MB per candidate, and check IO congestion when issuing them to disk.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +- fs/f2fs/segment.c | 139 +++++++++++++++++++++++++--------------------- 2 files changed, 78 insertions(+), 65 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 978c58d329f8..0bb23ad94b39 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,7 @@ enum { (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ +#define DEF_MAX_DISCARD_LEN 512 /* Max. 2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -758,7 +759,8 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front) { - return back->lstart + back->len == front->lstart; + return (back->lstart + back->len == front->lstart) && + (back->len + front->len < DEF_MAX_DISCARD_LEN); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 01bc94df9f00..0889f4b8dbf3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1210,68 +1210,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy, - unsigned int start, unsigned int end) -{ - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct discard_cmd *prev_dc = NULL, *next_dc = NULL; - struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct discard_cmd *dc; - struct blk_plug plug; - int issued; - -next: - issued = 0; - - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); - - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, - NULL, start, - 
(struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); - if (!dc) - dc = next_dc; - - blk_start_plug(&plug); - - while (dc && dc->lstart <= end) { - struct rb_node *node; - - if (dc->len < dpolicy->granularity) - goto skip; - - if (dc->state != D_PREP) { - list_move_tail(&dc->list, &dcc->fstrim_list); - goto skip; - } - - __submit_discard_cmd(sbi, dpolicy, dc); - - if (++issued >= dpolicy->max_requests) { - start = dc->lstart + dc->len; - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); - - schedule(); - - goto next; - } -skip: - node = rb_next(&dc->rb_node); - dc = rb_entry_safe(node, struct discard_cmd, rb_node); - - if (fatal_signal_pending(current)) - break; - } - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); -} - static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { @@ -1412,7 +1350,18 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { - __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + struct discard_policy dp; + + if (dpolicy) { + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + return; + } + + /* wait all */ + init_discard_policy(&dp, DPOLICY_FSTRIM, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + init_discard_policy(&dp, DPOLICY_UMOUNT, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1460,8 +1409,9 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); - __wait_all_discard_cmd(sbi, &dpolicy); + /* just to make sure there is no pending discard commands */ + __wait_all_discard_cmd(sbi, NULL); return dropped; } @@ -2453,6 +2403,67 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control 
*cpc) return has_candidate; } +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + __wait_all_discard_cmd(sbi, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); From 70676ef73646598128b0521187d78e2fd492bed0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 May 2018 18:02:58 -0700 Subject: [PATCH 0672/1212] f2fs: add fsync_mode=nobarrier for non-atomic files For non-atomic files, this patch adds an option to give nobarrier which doesn't issue flush commands to the device. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 16 +++++++++------- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 4 ++++ 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 1f52baea2f69..ecccb51c7279 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -180,13 +180,15 @@ whint_mode=%s Control which write hints are passed down to block passes down hints with its policy. alloc_mode=%s Adjust block allocation policy, which supports "reuse" and "default". -fsync_mode=%s Control the policy of fsync. Currently supports "posix" - and "strict". In "posix" mode, which is default, fsync - will follow POSIX semantics and does a light operation - to improve the filesystem performance. In "strict" mode, - fsync will be heavy and behaves in line with xfs, ext4 - and btrfs, where xfstest generic/342 will pass, but the - performance will regress. +fsync_mode=%s Control the policy of fsync. Currently supports "posix", + "strict", and "nobarrier". In "posix" mode, which is + default, fsync will follow POSIX semantics and does a + light operation to improve the filesystem performance. + In "strict" mode, fsync will be heavy and behaves in line + with xfs, ext4 and btrfs, where xfstest generic/342 will + pass, but the performance will regress. "nobarrier" is + based on "posix", but doesn't issue flush command for + non-atomic files likewise "nobarrier" mount option. test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt context. The fake fscrypt context is used by xfstests. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0bb23ad94b39..2b722d50f096 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1146,6 +1146,7 @@ enum { enum fsync_mode { FSYNC_MODE_POSIX, /* fsync follows posix semantics */ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ + FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; #ifdef CONFIG_F2FS_FS_ENCRYPTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7587758a285f..40d03d58b390 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -309,7 +309,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic) + if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { remove_ino_entry(sbi, ino, UPDATE_INO); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 55b2bad55671..cb57ad3ca32d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -740,6 +740,10 @@ static int parse_options(struct super_block *sb, char *options) } else if (strlen(name) == 6 && !strncmp(name, "strict", 6)) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else if (strlen(name) == 9 && + !strncmp(name, "nobarrier", 9)) { + F2FS_OPTION(sbi).fsync_mode = + FSYNC_MODE_NOBARRIER; } else { kfree(name); return -EINVAL; From 31e2713935ea102ddb29dc5bf496d335a213f7f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 May 2018 09:58:42 -0700 Subject: [PATCH 0673/1212] f2fs: issue discard commands proactively in high fs utilization Under high utilization (over 80%), we don't expect a huge number of large discard commands, but we do expect many small pending discards, which affect FTL GCs a lot. Let's issue them in that case.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 71 ++++++++++++++++++++++++++--------------------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2b722d50f096..249635a5f472 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -253,6 +253,7 @@ enum { #define DEF_MAX_DISCARD_LEN 512 /* Max. 2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ +#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -2861,8 +2862,6 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void init_discard_policy(struct discard_policy *dpolicy, int discard_type, - unsigned int granularity); void drop_discard_cmd(struct f2fs_sb_info *sbi); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0889f4b8dbf3..8df1a168256b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,6 +996,38 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, #endif } +static void __init_discard_policy(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->granularity = granularity; + + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = true; + if 
(utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + dpolicy->granularity = 1; + dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + } + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->io_aware = false; + } +} + + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, @@ -1358,9 +1390,9 @@ static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, } /* wait all */ - init_discard_policy(&dp, DPOLICY_FSTRIM, 1); + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); - init_discard_policy(&dp, DPOLICY_UMOUNT, 1); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); } @@ -1406,7 +1438,8 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; - init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, + dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -1427,7 +1460,7 @@ static int issue_discard_thread(void *data) set_freezable(); do { - init_discard_policy(&dpolicy, DPOLICY_BG, + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); wait_event_interruptible_timeout(*q, @@ -1445,7 +1478,7 @@ static int issue_discard_thread(void *data) dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, DPOLICY_FORCE, 1); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); @@ -1738,32 +1771,6 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, 
struct cp_control *cpc) wake_up_discard_thread(sbi, false); } -void init_discard_policy(struct discard_policy *dpolicy, - int discard_type, unsigned int granularity) -{ - /* common policy */ - dpolicy->type = discard_type; - dpolicy->sync = true; - dpolicy->granularity = granularity; - - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; - - if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; - } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->io_aware = false; - } -} - static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -2522,7 +2529,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); - init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); From 4738f527db84ecb5d40691b8e5bf3e9bfced2243 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Apr 2018 10:25:23 +0800 Subject: [PATCH 0674/1212] f2fs: don't split checkpoint in fstrim Now that we issue discards asynchronously in a separate thread instead of in checkpoint, we no longer encounter long checkpoint latency caused by handling a huge number of synchronous discard commands. So we don't need to split the checkpoint to do trim in batches; merge it into a single checkpoint and obsolete the related sysfs entry.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 1 + fs/f2fs/f2fs.h | 5 ---- fs/f2fs/segment.c | 39 ++++++++----------------- fs/f2fs/sysfs.c | 3 ++ 4 files changed, 16 insertions(+), 32 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index b8d0a30f1644..f82da9bbb1fd 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -101,6 +101,7 @@ Date: February 2015 Contact: "Jaegeuk Kim" Description: Controls the trimming rate in batch mode. + <deprecated> What: /sys/fs/f2fs/<disk>/cp_interval Date: October 2015 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 249635a5f472..51e4a9499f04 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -243,11 +243,6 @@ enum { #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2048 -#define BATCHED_TRIM_SEGMENTS(sbi) \ - (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) -#define BATCHED_TRIM_BLOCKS(sbi) \ - (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MAX_DISCARD_LEN 512 /* Max.
2MB per discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8df1a168256b..30f07dd5da3f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2475,7 +2475,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno, cur_segno; + unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; @@ -2501,40 +2501,27 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; - /* do checkpoint to issue discard commands safely */ - for (cur_segno = start_segno; cur_segno <= end_segno; - cur_segno = cpc.trim_end + 1) { - cpc.trim_start = cur_segno; + if (sbi->discard_blks == 0) + goto out; - if (sbi->discard_blks == 0) - break; - else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) - cpc.trim_end = end_segno; - else - cpc.trim_end = min_t(unsigned int, - rounddown(cur_segno + - BATCHED_TRIM_SEGMENTS(sbi), - sbi->segs_per_sec) - 1, end_segno); - - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); - if (err) - break; - - schedule(); - } + mutex_lock(&sbi->gc_mutex); + err = write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + if (err) + goto out; start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); -out: range->len = F2FS_BLK_TO_BYTES(trimmed); +out: return err; } @@ -3922,8 +3909,6 @@ int 
build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); - sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; - INIT_LIST_HEAD(&sm_info->sit_entry_set); init_rwsem(&sm_info->curseg_lock); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f33a56d6e6dd..2c53de9251be 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -245,6 +245,9 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "trim_sections")) + return -EINVAL; + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) From 85d2070f60c66b469e21a0c8e67c25c7cd5b4c45 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 10 Apr 2018 15:43:09 +0800 Subject: [PATCH 0675/1212] f2fs: turn down IO priority of discard from background In order to avoid interfering normal r/w IO, let's turn down IO priority of discard issued from background. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30f07dd5da3f..478a4504ba9a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1012,6 +1012,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = true; + dpolicy->sync = false; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; From 73450231fffffe6a2863d493cd053596f2a2de57 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 31 May 2018 10:20:48 -0700 Subject: [PATCH 0676/1212] f2fs: run fstrim asynchronously if runtime discard is on We don't need to wait for whole bunch of discard candidates in fstrim, since runtime discard will issue them in idle time. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 478a4504ba9a..a02d5c1a7ed2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2519,9 +2519,18 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + + /* + * We filed discard candidates, but actually we don't need to wait for + * all of them, since they'll be issued in idle time along with runtime + * discard option. User configuration looks like using runtime discard + * or periodic fstrim instead of it. + */ + if (!test_opt(sbi, DISCARD)) { + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - range->len = F2FS_BLK_TO_BYTES(trimmed); + range->len = F2FS_BLK_TO_BYTES(trimmed); + } out: return err; } From 39b14449060651cde9d2d5e0b6e48f0674a087c7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 20 Apr 2018 16:30:02 -0700 Subject: [PATCH 0677/1212] fscrypt: use unbound workqueue for decryption Improve fscrypt read performance by switching the decryption workqueue from bound to unbound. With the bound workqueue, when multiple bios completed on the same CPU, they were decrypted on that same CPU. But with the unbound queue, they are now decrypted in parallel on any CPU. Although fscrypt read performance can be tough to measure due to the many sources of variation, this change is most beneficial when decryption is slow, e.g. on CPUs without AES instructions. For example, I timed tarring up encrypted directories on f2fs. On x86 with AES-NI instructions disabled, the unbound workqueue improved performance by about 25-35%, using 1 to NUM_CPUs jobs with 4 or 8 CPUs available. But with AES-NI enabled, performance was unchanged to within ~2%. 
I also did the same test on a quad-core ARM CPU using xts-speck128-neon encryption. There performance was usually about 10% better with the unbound workqueue, bringing it closer to the unencrypted speed. The unbound workqueue may be worse in some cases due to worse locality, but I think it's still the better default. dm-crypt uses an unbound workqueue by default too, so this change makes fscrypt match. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 0758d32ad01b..2f646b1248bc 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -433,8 +433,17 @@ int fscrypt_initialize(unsigned int cop_flags) */ static int __init fscrypt_init(void) { + /* + * Use an unbound workqueue to allow bios to be decrypted in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since decryption is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize decryption work, + * which blocks reads from completing, over regular application tasks. + */ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", - WQ_HIGHPRI, 0); + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); if (!fscrypt_read_workqueue) goto fail; From fb10231825e94a1eea7d5e0b9d23824b6add6113 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:36 -0700 Subject: [PATCH 0678/1212] fscrypt: clean up after fscrypt_prepare_lookup() conversions Now that all filesystems have been converted to use fscrypt_prepare_lookup(), we can remove the fscrypt_set_d_op() and fscrypt_set_encrypted_dentry() functions as well as un-export fscrypt_d_ops. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 1 - fs/crypto/fscrypt_private.h | 1 + include/linux/fscrypt_notsupp.h | 10 ---------- include/linux/fscrypt_supp.h | 14 -------------- 4 files changed, 1 insertion(+), 25 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2f646b1248bc..a00efa266eb5 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -359,7 +359,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, }; -EXPORT_SYMBOL(fscrypt_d_ops); void fscrypt_restore_control_page(struct page *page) { diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 426aa1b27f17..978d0e061aed 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -115,6 +115,7 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, gfp_t gfp_flags); extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +extern const struct dentry_operations fscrypt_d_ops; /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 44bd4fbd3ec5..e2729c6d9829 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -67,16 +67,6 @@ static inline void fscrypt_restore_control_page(struct page *page) return; } -static inline void fscrypt_set_d_op(struct dentry *dentry) -{ - return; -} - -static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) -{ - return; -} - /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 9d1857302b73..4f0a5c5ab441 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -74,20 +74,6 @@ static inline struct page 
*fscrypt_control_page(struct page *page) extern void fscrypt_restore_control_page(struct page *); -extern const struct dentry_operations fscrypt_d_ops; - -static inline void fscrypt_set_d_op(struct dentry *dentry) -{ - d_set_d_op(dentry, &fscrypt_d_ops); -} - -static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; - spin_unlock(&dentry->d_lock); -} - /* policy.c */ extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); extern int fscrypt_ioctl_get_policy(struct file *, void __user *); From f68d3b84aef18bb91329907995d2c7e083ae8c75 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:37 -0700 Subject: [PATCH 0679/1212] fscrypt: remove unnecessary NULL check when allocating skcipher crypto_alloc_skcipher() returns an ERR_PTR() on failure, not NULL. Remove the unnecessary check for NULL. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 7c00331da5df..7750179bba4b 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -317,8 +317,8 @@ int fscrypt_get_encryption_info(struct inode *inode) goto out; } ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + if (IS_ERR(ctfm)) { + res = PTR_ERR(ctfm); pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", __func__, res, inode->i_ino); goto out; From d56de4e926ade7e0afd929792aced636c1f178ba Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:38 -0700 Subject: [PATCH 0680/1212] fscrypt: remove error messages for skcipher_request_alloc() failure skcipher_request_alloc() can only fail due to lack of memory, and in that case the memory allocator will have already printed a detailed error message. Thus, remove the redundant error messages from fscrypt. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 6 +----- fs/crypto/fname.c | 10 ++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index a00efa266eb5..021f348900b1 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -162,12 +162,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, } req = skcipher_request_alloc(tfm, gfp_flags); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index b18fa323d1d9..8af9e35b4f29 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -58,11 +58,8 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: skcipher_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); @@ -107,11 +104,8 @@ static int fname_decrypt(struct inode *inode, /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); From 89b7fb82982fbe9a0951fde557a23cdf99b8cdbd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:39 -0700 Subject: [PATCH 0681/1212] fscrypt: remove stale comment from fscrypt_d_revalidate() Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c 
index 021f348900b1..b12c53e6efb1 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -328,7 +328,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); From 52c51f7b7bde658d8e5abb50729dae361d0e8e35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:40 -0700 Subject: [PATCH 0682/1212] fscrypt: don't clear flags on crypto transform fscrypt is clearing the flags on the crypto_skcipher it allocates for each inode. But, this is unnecessary and may cause problems in the future because it will even clear flags that are meant to be internal to the crypto API, e.g. CRYPTO_TFM_NEED_KEY. Remove the unnecessary flag clearing. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 7750179bba4b..875ee0108468 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -324,7 +324,6 @@ int fscrypt_get_encryption_info(struct inode *inode) goto out; } crypt_info->ci_ctfm = ctfm; - crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); /* * if the provided key is longer than keysize, we use the first From 3f7af9d27fd6cabff05d149ca1178a5e27852c19 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:41 -0700 Subject: [PATCH 0683/1212] fscrypt: don't special-case EOPNOTSUPP from fscrypt_get_encryption_info() In fscrypt_setup_filename(), remove the unnecessary check for fscrypt_get_encryption_info() returning EOPNOTSUPP. There's no reason to handle this error differently from any other. I think there may have been some confusion because the "notsupp" version of fscrypt_get_encryption_info() returns EOPNOTSUPP -- but that's not applicable from inside fs/crypto/. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 8af9e35b4f29..19715de54d37 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -334,7 +334,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } ret = fscrypt_get_encryption_info(dir); - if (ret && ret != -EOPNOTSUPP) + if (ret) return ret; if (dir->i_crypt_info) { From 0077eff1d2e3816a40b71997ab677bb6ca671115 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:42 -0700 Subject: [PATCH 0684/1212] fscrypt: drop max_namelen check from fname_decrypt() fname_decrypt() returns an error if the input filename is longer than the inode's ->max_namelen() as given by the filesystem. But, this doesn't actually make sense because the filesystem provided the input filename in the first place, where it was subject to the filesystem's limits. And fname_decrypt() has no internal limit itself. Thus, remove this unnecessary check. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 19715de54d37..d21a5329d6ca 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -92,14 +92,11 @@ static int fname_decrypt(struct inode *inode, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - unsigned lim; - lim = inode->i_sb->s_cop->max_namelen(inode); - if (iname->len <= 0 || iname->len > lim) + if (iname->len <= 0) return -EIO; /* Allocate request */ From f572a22ef9a515eb0f81d6e9bac0fcc4988399f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:43 -0700 Subject: [PATCH 0685/1212] fscrypt: drop empty name check from fname_decrypt() fname_decrypt() is validating that the encrypted filename is nonempty. However, earlier a stronger precondition was already enforced: the encrypted filename must be at least 16 (FS_CRYPTO_BLOCK_SIZE) bytes. Drop the redundant check for an empty filename. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d21a5329d6ca..6c4c84ec18ff 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -96,9 +96,6 @@ static int fname_decrypt(struct inode *inode, int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - if (iname->len <= 0) - return -EIO; - /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) From 56446c91422e938c60f19fcf36115cbaae737b0d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:44 -0700 Subject: [PATCH 0686/1212] fscrypt: make fscrypt_operations.max_namelen an integer Now ->max_namelen() is only called to limit the filename length when adding NUL padding, and only for real filenames -- not symlink targets. It also didn't give the correct length for symlink targets anyway since it forgot to subtract 'sizeof(struct fscrypt_symlink_data)'. Thus, change ->max_namelen from a function to a simple 'unsigned int' that gives the filesystem's maximum filename length. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 2 +- fs/f2fs/super.c | 8 +------- include/linux/fscrypt_supp.h | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6c4c84ec18ff..b1b69ec4b4ff 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -333,7 +333,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { if (!fscrypt_fname_encrypted_size(dir, iname->len, - dir->i_sb->s_cop->max_namelen(dir), + dir->i_sb->s_cop->max_namelen, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cb57ad3ca32d..777ed4eafa6c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1938,19 +1938,13 @@ static bool f2fs_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); } -static unsigned f2fs_max_namelen(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? 
- inode->i_sb->s_blocksize : F2FS_NAME_LEN; -} - static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, - .max_namelen = f2fs_max_namelen, + .max_namelen = F2FS_NAME_LEN, }; #endif diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 4f0a5c5ab441..46b62d82b6d6 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -28,7 +28,7 @@ struct fscrypt_operations { int (*set_context)(struct inode *, const void *, size_t, void *); bool (*dummy_context)(struct inode *); bool (*empty_dir)(struct inode *); - unsigned (*max_namelen)(struct inode *); + unsigned int max_namelen; }; struct fscrypt_ctx { From 7149dd4d39b54d3a59ecea7b2a95c842aa39a283 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:45 -0700 Subject: [PATCH 0687/1212] fscrypt: remove unnecessary check for non-logon key type We're passing 'key_type_logon' to request_key(), so the found key is guaranteed to be of type "logon". Thus, there is no reason to check later that the key is really a "logon" key. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 875ee0108468..90b326941c67 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -88,12 +88,6 @@ static int validate_user_key(struct fscrypt_info *crypt_info, return PTR_ERR(keyring_key); down_read(&keyring_key->sem); - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } ukp = user_key_payload(keyring_key); if (!ukp) { /* key was revoked before we acquired its semaphore */ From ff8e7c745e2bb71c549a0813dc8fbd8a1daf970f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:46 -0700 Subject: [PATCH 0688/1212] fscrypt: remove internal key size constants With one exception, the internal key size constants such as FS_AES_256_XTS_KEY_SIZE are only used for the 'available_modes' array, where they really only serve to obfuscate what the values are. Also some of the constants are unused, and the key sizes tend to be in the names of the algorithms anyway. In the past these values were also misused, e.g. we used to have FS_AES_256_XTS_KEY_SIZE in places that technically should have been FS_MAX_KEY_SIZE. The exception is that FS_AES_128_ECB_KEY_SIZE is used for key derivation. But it's more appropriate to use FS_KEY_DERIVATION_NONCE_SIZE for that instead. Thus, just put the sizes directly in the 'available_modes' array. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 10 +--------- fs/crypto/keyinfo.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 978d0e061aed..cc64e7e42fa1 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -17,15 +17,7 @@ /* Encryption parameters */ #define FS_IV_SIZE 16 -#define FS_AES_128_ECB_KEY_SIZE 16 -#define FS_AES_128_CBC_KEY_SIZE 16 -#define FS_AES_128_CTS_KEY_SIZE 16 -#define FS_AES_256_GCM_KEY_SIZE 32 -#define FS_AES_256_CBC_KEY_SIZE 32 -#define FS_AES_256_CTS_KEY_SIZE 32 -#define FS_AES_256_XTS_KEY_SIZE 64 - -#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_KEY_DERIVATION_NONCE_SIZE 16 /** * Encryption context for inode diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 90b326941c67..f1ea6c517cfb 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -26,7 +26,7 @@ static struct crypto_shash *essiv_hash_tfm; * * Return: Zero on success; non-zero otherwise. 
*/ -static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], +static int derive_key_aes(u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], const struct fscrypt_key *source_key, u8 derived_raw_key[FS_MAX_KEY_SIZE]) { @@ -51,7 +51,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); res = crypto_skcipher_setkey(tfm, deriving_key, - FS_AES_128_ECB_KEY_SIZE); + FS_KEY_DERIVATION_NONCE_SIZE); if (res < 0) goto out; @@ -99,7 +99,6 @@ static int validate_user_key(struct fscrypt_info *crypt_info, goto out; } master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE || master_key->size % AES_BLOCK_SIZE != 0) { @@ -120,14 +119,10 @@ static const struct { const char *cipher_str; int keysize; } available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", - FS_AES_256_XTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", - FS_AES_256_CTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", - FS_AES_128_CBC_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", - FS_AES_128_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", 64 }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", 32 }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", 16 }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", 16 }, }; static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, From 52359cf4fd6dd5208b6b9613df5140dfd9a329c7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:47 -0700 Subject: [PATCH 0689/1212] fscrypt: use a common logging function Use a common function for fscrypt warning and error messages so that all the messages are consistently ratelimited, include the "fscrypt:" prefix, and include the filesystem name if applicable. 
Also fix up a few of the log messages to be more descriptive. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 28 +++++++++++++++++++++++++--- fs/crypto/fname.c | 10 ++++++---- fs/crypto/fscrypt_private.h | 8 ++++++++ fs/crypto/hooks.c | 5 +++-- fs/crypto/keyinfo.c | 27 +++++++++++++++------------ 5 files changed, 57 insertions(+), 21 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b12c53e6efb1..0f46cf550907 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -180,9 +180,10 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_skcipher_encrypt() returned %d\n", - __func__, res); + fscrypt_err(inode->i_sb, + "%scryption failed for inode %lu, block %llu: %d", + (rw == FS_DECRYPT ? "de" : "en"), + inode->i_ino, lblk_num, res); return res; } return 0; @@ -422,6 +423,27 @@ int fscrypt_initialize(unsigned int cop_flags) return res; } +void fscrypt_msg(struct super_block *sb, const char *level, + const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf); + else + printk("%sfscrypt: %pV\n", level, &vaf); + va_end(args); +} + /** * fscrypt_init() - Set up for fs encryption. 
*/ diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index b1b69ec4b4ff..1bdb9f226eec 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -70,8 +70,9 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename encryption failed for inode %lu: %d", + inode->i_ino, res); return res; } @@ -114,8 +115,9 @@ static int fname_decrypt(struct inode *inode, res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename decryption failed for inode %lu: %d", + inode->i_ino, res); return res; } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index cc64e7e42fa1..92c6c0ace1b1 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -109,6 +109,14 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); extern const struct dentry_operations fscrypt_d_ops; +extern void __printf(3, 4) __cold +fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...); + +#define fscrypt_warn(sb, fmt, ...) \ + fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) +#define fscrypt_err(sb, fmt, ...) 
\ + fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) + /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index bc010e4609ef..b5328a0c6364 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -39,8 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { - pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", - d_inode(dir)->i_ino, inode->i_ino); + fscrypt_warn(inode->i_sb, + "inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); err = -EPERM; } dput(dir); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index f1ea6c517cfb..580117f81a54 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -102,9 +102,8 @@ static int validate_user_key(struct fscrypt_info *crypt_info, if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE || master_key->size % AES_BLOCK_SIZE != 0) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); + fscrypt_warn(NULL, "key size incorrect: %u", + master_key->size); res = -ENOKEY; goto out; } @@ -131,9 +130,10 @@ static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, u32 mode; if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { - pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", - inode->i_ino, - ci->ci_data_mode, ci->ci_filename_mode); + fscrypt_warn(inode->i_sb, + "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", + inode->i_ino, ci->ci_data_mode, + ci->ci_filename_mode); return -EINVAL; } @@ -172,8 +172,9 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) tfm = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(tfm)) { 
- pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", - PTR_ERR(tfm)); + fscrypt_warn(NULL, + "error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); return PTR_ERR(tfm); } prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); @@ -308,8 +309,9 @@ int fscrypt_get_encryption_info(struct inode *inode) ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (IS_ERR(ctfm)) { res = PTR_ERR(ctfm); - pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", - __func__, res, inode->i_ino); + fscrypt_warn(inode->i_sb, + "error allocating '%s' transform for inode %lu: %d", + cipher_str, inode->i_ino, res); goto out; } crypt_info->ci_ctfm = ctfm; @@ -326,8 +328,9 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { res = init_essiv_generator(crypt_info, raw_key, keysize); if (res) { - pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", - __func__, res, inode->i_ino); + fscrypt_warn(inode->i_sb, + "error initializing ESSIV generator for inode %lu: %d", + inode->i_ino, res); goto out; } } From f68a71fa8f773c82ed70feb398d7b1ab8cca2dd3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:48 -0700 Subject: [PATCH 0690/1212] fscrypt: separate key lookup from key derivation Refactor the confusingly-named function 'validate_user_key()' into a new function 'find_and_derive_key()' which first finds the keyring key, then does the key derivation. Among other benefits this avoids the strange behavior we had previously where if key derivation failed for some reason, then we would fall back to the alternate key prefix. Now, we'll only fall back to the alternate key prefix if a valid key isn't found. This patch also improves the warning messages that are logged when the keyring key's payload is invalid. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 124 +++++++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 49 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 580117f81a54..86177a7b1001 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -26,7 +26,7 @@ static struct crypto_shash *essiv_hash_tfm; * * Return: Zero on success; non-zero otherwise. */ -static int derive_key_aes(u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], +static int derive_key_aes(const u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], const struct fscrypt_key *source_key, u8 derived_raw_key[FS_MAX_KEY_SIZE]) { @@ -66,52 +66,88 @@ static int derive_key_aes(u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], return res; } -static int validate_user_key(struct fscrypt_info *crypt_info, - struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix, int min_keysize) +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. 
+ */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct fscrypt_key **payload_ret) { char *description; - struct key *keyring_key; - struct fscrypt_key *master_key; + struct key *key; const struct user_key_payload *ukp; - int res; + const struct fscrypt_key *payload; description = kasprintf(GFP_NOFS, "%s%*phN", prefix, - FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); + FS_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - keyring_key = request_key(&key_type_logon, description, NULL); + key = request_key(&key_type_logon, description, NULL); kfree(description); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - down_read(&keyring_key->sem); + if (IS_ERR(key)) + return key; - ukp = user_key_payload(keyring_key); - if (!ukp) { - /* key was revoked before we acquired its semaphore */ - res = -EKEYREVOKED; - goto out; - } - if (ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct fscrypt_key *)ukp->data; + down_read(&key->sem); + ukp = user_key_payload(key); - if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE - || master_key->size % AES_BLOCK_SIZE != 0) { - fscrypt_warn(NULL, "key size incorrect: %u", - master_key->size); - res = -ENOKEY; - goto out; + if (!ukp) /* was the key revoked before we acquired its semaphore? 
*/ + goto invalid; + + payload = (const struct fscrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct fscrypt_key) || + payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) { + fscrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; } - res = derive_key_aes(ctx->nonce, master_key, raw_key); -out: - up_read(&keyring_key->sem); - key_put(keyring_key); - return res; + + if (payload->size < min_keysize || + payload->size % AES_BLOCK_SIZE != 0) { + fscrypt_warn(NULL, + "key with description '%s' is too short or is misaligned (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; + } + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Find the master key, then derive the inode's actual encryption key */ +static int find_and_derive_key(const struct inode *inode, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) +{ + struct key *key; + const struct fscrypt_key *payload; + int err; + + key = find_and_lock_process_key(FS_KEY_DESC_PREFIX, + ctx->master_key_descriptor, + derived_keysize, &payload); + if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) { + key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix, + ctx->master_key_descriptor, + derived_keysize, &payload); + } + if (IS_ERR(key)) + return PTR_ERR(key); + err = derive_key_aes(ctx->nonce, payload, derived_key); + up_read(&key->sem); + key_put(key); + return err; } static const struct { @@ -292,20 +328,10 @@ int fscrypt_get_encryption_info(struct inode *inode) if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, - keysize); - if (res && inode->i_sb->s_cop->key_prefix) { - int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix, - keysize); - if (res2) { - if (res2 == -ENOKEY) - res = -ENOKEY; - goto out; - } - } 
else if (res) { + res = find_and_derive_key(inode, &ctx, raw_key, keysize); + if (res) goto out; - } + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (IS_ERR(ctfm)) { res = PTR_ERR(ctfm); From 27a0e77380a3767929df1b4e563096f51b90a041 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:49 -0700 Subject: [PATCH 0691/1212] fscrypt: only derive the needed portion of the key Currently the key derivation function in fscrypt uses the master key length as the amount of output key material to derive. This works, but it means we can waste time deriving more key material than is actually used, e.g. most commonly, deriving 64 bytes for directories which only take a 32-byte AES-256-CTS-CBC key. It also forces us to validate that the master key length is a multiple of AES_BLOCK_SIZE, which wouldn't otherwise be necessary. Fix it to only derive the needed length key. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 86177a7b1001..44bcb695c206 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -18,17 +18,16 @@ static struct crypto_shash *essiv_hash_tfm; -/** - * derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivation. - * @source_key: Source key to which to apply derivation. - * @derived_raw_key: Derived raw key. +/* + * Key derivation function. This generates the derived key by encrypting the + * master key with AES-128-ECB using the inode's nonce as the AES key. * - * Return: Zero on success; non-zero otherwise. + * The master key must be at least as long as the derived key. If the master + * key is longer, then only the first 'derived_keysize' bytes are used. 
*/ -static int derive_key_aes(const u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], - const struct fscrypt_key *source_key, - u8 derived_raw_key[FS_MAX_KEY_SIZE]) +static int derive_key_aes(const u8 *master_key, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) { int res = 0; struct skcipher_request *req = NULL; @@ -50,14 +49,13 @@ static int derive_key_aes(const u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, deriving_key, - FS_KEY_DERIVATION_NONCE_SIZE); + res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce)); if (res < 0) goto out; - sg_init_one(&src_sg, source_key->raw, source_key->size); - sg_init_one(&dst_sg, derived_raw_key, source_key->size); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, NULL); res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: @@ -108,10 +106,9 @@ find_and_lock_process_key(const char *prefix, goto invalid; } - if (payload->size < min_keysize || - payload->size % AES_BLOCK_SIZE != 0) { + if (payload->size < min_keysize) { fscrypt_warn(NULL, - "key with description '%s' is too short or is misaligned (got %u bytes, need %u+ bytes)", + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", key->description, payload->size, min_keysize); goto invalid; } @@ -144,7 +141,7 @@ static int find_and_derive_key(const struct inode *inode, } if (IS_ERR(key)) return PTR_ERR(key); - err = derive_key_aes(ctx->nonce, payload, derived_key); + err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize); up_read(&key->sem); key_put(key); return err; @@ -324,7 +321,7 @@ int fscrypt_get_encryption_info(struct inode *inode) * crypto 
API as part of key derivation. */ res = -ENOMEM; - raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + raw_key = kmalloc(keysize, GFP_NOFS); if (!raw_key) goto out; @@ -342,10 +339,6 @@ int fscrypt_get_encryption_info(struct inode *inode) } crypt_info->ci_ctfm = ctfm; crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - /* - * if the provided key is longer than keysize, we use the first - * keysize bytes of the derived key only - */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; From eb13e0b69296ad1d3a9a3fa0cb6570aaf99f9f0c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 7 May 2018 17:22:08 -0700 Subject: [PATCH 0692/1212] fscrypt: add Speck128/256 support fscrypt currently only supports AES encryption. However, many low-end mobile devices have older CPUs that don't have AES instructions, e.g. the ARMv8 Cryptography Extensions. Currently, user data on such devices is not encrypted at rest because AES is too slow, even when the NEON bit-sliced implementation of AES is used. Unfortunately, it is infeasible to encrypt these devices at all when AES is the only option. Therefore, this patch updates fscrypt to support the Speck block cipher, which was recently added to the crypto API. The C implementation of Speck is not especially fast, but Speck can be implemented very efficiently with general-purpose vector instructions, e.g. ARM NEON. For example, on an ARMv7 processor, we measured the NEON-accelerated Speck128/256-XTS at 69 MB/s for both encryption and decryption, while AES-256-XTS with the NEON bit-sliced implementation was only 22 MB/s encryption and 19 MB/s decryption. There are multiple variants of Speck. This patch only adds support for Speck128/256, which is the variant with a 128-bit block size and 256-bit key size -- the same as AES-256. This is believed to be the most secure variant of Speck, and it's only about 6% slower than Speck128/128. 
Speck64/128 would be at least 20% faster because it has 20% fewer rounds, and it can be even faster on CPUs that can't efficiently do the 64-bit operations needed for Speck128. However, Speck64's 64-bit block size is not preferred security-wise. ARM NEON also supports the needed 64-bit operations even on 32-bit CPUs, resulting in Speck128 being fast enough for our targeted use cases so far. The chosen modes of operation are XTS for contents and CTS-CBC for filenames. These are the same modes of operation that fscrypt defaults to for AES. Note that as with the other fscrypt modes, Speck will not be used unless userspace chooses to use it. Nor are any of the existing modes (which are all AES-based) being removed, of course. We intentionally don't make CONFIG_FS_ENCRYPTION select CONFIG_CRYPTO_SPECK, so people will have to enable Speck support themselves if they need it. This is because we shouldn't bloat the FS_ENCRYPTION dependencies with every new cipher, especially ones that aren't recommended for most users. Moreover, CRYPTO_SPECK is just the generic implementation, which won't be fast enough for many users; in practice, they'll need to enable CRYPTO_SPECK_NEON to get acceptable performance. More details about our choice of Speck can be found in our patches that added Speck to the crypto API, and the follow-on discussion threads. We're planning a publication that explains the choice in more detail. But briefly, we can't use ChaCha20 as we previously proposed, since it would be insecure to use a stream cipher in this context, with potential IV reuse during writes on f2fs and/or on wear-leveling flash storage. We also evaluated many other lightweight and/or ARX-based block ciphers such as Chaskey-LTS, RC5, LEA, CHAM, Threefish, RC6, NOEKEON, SPARX, and XTEA. However, all had disadvantages vs. Speck, such as insufficient performance with NEON, much less published cryptanalysis, or an insufficient security level.
Various design choices in Speck make it perform better with NEON than competing ciphers while still having a security margin similar to AES, and in the case of Speck128 also the same available security levels. Unfortunately, Speck does have some political baggage attached -- it's an NSA designed cipher, and was rejected from an ISO standard (though for context, as far as I know none of the above-mentioned alternatives are ISO standards either). Nevertheless, we believe it is a good solution to the problem from a technical perspective. Certain algorithms constructed from ChaCha or the ChaCha permutation, such as MEM (Masked Even-Mansour) or HPolyC, may also meet our performance requirements. However, these are new constructions that need more time to receive the cryptographic review and acceptance needed to be confident in their security. HPolyC hasn't been published yet, and we are concerned that MEM makes stronger assumptions about the underlying permutation than the ChaCha stream cipher does. In contrast, the XTS mode of operation is relatively well accepted, and Speck has over 70 cryptanalysis papers. Of course, these ChaCha-based algorithms can still be added later if they become ready. The best known attack on Speck128/256 is a differential cryptanalysis attack on 25 of 34 rounds with 2^253 time complexity and 2^125 chosen plaintexts, i.e. only marginally faster than brute force. There is no known attack on the full 34 rounds. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- Documentation/filesystems/fscrypt.rst | 626 ++++++++++++++++++++++++++ fs/crypto/fscrypt_private.h | 4 + fs/crypto/keyinfo.c | 2 + include/uapi/linux/fs.h | 2 + 4 files changed, 634 insertions(+) create mode 100644 Documentation/filesystems/fscrypt.rst diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst new file mode 100644 index 000000000000..48b424de85bb --- /dev/null +++ b/Documentation/filesystems/fscrypt.rst @@ -0,0 +1,626 @@ +===================================== +Filesystem-level encryption (fscrypt) +===================================== + +Introduction +============ + +fscrypt is a library which filesystems can hook into to support +transparent encryption of files and directories. + +Note: "fscrypt" in this document refers to the kernel-level portion, +implemented in ``fs/crypto/``, as opposed to the userspace tool +`fscrypt <https://github.com/google/fscrypt>`_. This document only +covers the kernel-level portion. For command-line examples of how to +use encryption, see the documentation for the userspace tool `fscrypt +<https://github.com/google/fscrypt>`_. Also, it is recommended to use +the fscrypt userspace tool, or other existing userspace tools such as +`fscryptctl <https://github.com/google/fscryptctl>`_ or `Android's key +management system +<https://source.android.com/security/encryption/file-based>`_, over +using the kernel's API directly. Using existing tools reduces the +chance of introducing your own security bugs. (Nevertheless, for +completeness this documentation covers the kernel's API anyway.) + +Unlike dm-crypt, fscrypt operates at the filesystem level rather than +at the block device level. This allows it to encrypt different files +with different keys and to have unencrypted files on the same +filesystem. This is useful for multi-user systems where each user's +data-at-rest needs to be cryptographically isolated from the others. +However, except for filenames, fscrypt does not encrypt filesystem +metadata.
+ +Unlike eCryptfs, which is a stacked filesystem, fscrypt is integrated +directly into supported filesystems --- currently ext4, F2FS, and +UBIFS. This allows encrypted files to be read and written without +caching both the decrypted and encrypted pages in the pagecache, +thereby nearly halving the memory used and bringing it in line with +unencrypted files. Similarly, half as many dentries and inodes are +needed. eCryptfs also limits encrypted filenames to 143 bytes, +causing application compatibility issues; fscrypt allows the full 255 +bytes (NAME_MAX). Finally, unlike eCryptfs, the fscrypt API can be +used by unprivileged users, with no need to mount anything. + +fscrypt does not support encrypting files in-place. Instead, it +supports marking an empty directory as encrypted. Then, after +userspace provides the key, all regular files, directories, and +symbolic links created in that directory tree are transparently +encrypted. + +Threat model +============ + +Offline attacks +--------------- + +Provided that userspace chooses a strong encryption key, fscrypt +protects the confidentiality of file contents and filenames in the +event of a single point-in-time permanent offline compromise of the +block device content. fscrypt does not protect the confidentiality of +non-filename metadata, e.g. file sizes, file permissions, file +timestamps, and extended attributes. Also, the existence and location +of holes (unallocated blocks which logically contain all zeroes) in +files is not protected. + +fscrypt is not guaranteed to protect confidentiality or authenticity +if an attacker is able to manipulate the filesystem offline prior to +an authorized user later accessing the filesystem. + +Online attacks +-------------- + +fscrypt (and storage encryption in general) can only provide limited +protection, if any at all, against online attacks. 
In detail: + +fscrypt is only resistant to side-channel attacks, such as timing or +electromagnetic attacks, to the extent that the underlying Linux +Cryptographic API algorithms are. If a vulnerable algorithm is used, +such as a table-based implementation of AES, it may be possible for an +attacker to mount a side channel attack against the online system. +Side channel attacks may also be mounted against applications +consuming decrypted data. + +After an encryption key has been provided, fscrypt is not designed to +hide the plaintext file contents or filenames from other users on the +same system, regardless of the visibility of the keyring key. +Instead, existing access control mechanisms such as file mode bits, +POSIX ACLs, LSMs, or mount namespaces should be used for this purpose. +Also note that as long as the encryption keys are *anywhere* in +memory, an online attacker can necessarily compromise them by mounting +a physical attack or by exploiting any kernel security vulnerability +which provides an arbitrary memory read primitive. + +While it is ostensibly possible to "evict" keys from the system, +recently accessed encrypted files will remain accessible at least +until the filesystem is unmounted or the VFS caches are dropped, e.g. +using ``echo 2 > /proc/sys/vm/drop_caches``. Even after that, if the +RAM is compromised before being powered off, it will likely still be +possible to recover portions of the plaintext file contents, if not +some of the encryption keys as well. (Since Linux v4.12, all +in-kernel keys related to fscrypt are sanitized before being freed. +However, userspace would need to do its part as well.) + +Currently, fscrypt does not prevent a user from maliciously providing +an incorrect key for another user's existing encrypted files. A +protection against this is planned. + +Key hierarchy +============= + +Master Keys +----------- + +Each encrypted directory tree is protected by a *master key*. 
Master +keys can be up to 64 bytes long, and must be at least as long as the +greater of the key length needed by the contents and filenames +encryption modes being used. For example, if AES-256-XTS is used for +contents encryption, the master key must be 64 bytes (512 bits). Note +that the XTS mode is defined to require a key twice as long as that +required by the underlying block cipher. + +To "unlock" an encrypted directory tree, userspace must provide the +appropriate master key. There can be any number of master keys, each +of which protects any number of directory trees on any number of +filesystems. + +Userspace should generate master keys either using a cryptographically +secure random number generator, or by using a KDF (Key Derivation +Function). Note that whenever a KDF is used to "stretch" a +lower-entropy secret such as a passphrase, it is critical that a KDF +designed for this purpose be used, such as scrypt, PBKDF2, or Argon2. + +Per-file keys +------------- + +Master keys are not used to encrypt file contents or names directly. +Instead, a unique key is derived for each encrypted file, including +each regular file, directory, and symbolic link. This has several +advantages: + +- In cryptosystems, the same key material should never be used for + different purposes. Using the master key as both an XTS key for + contents encryption and as a CTS-CBC key for filenames encryption + would violate this rule. +- Per-file keys simplify the choice of IVs (Initialization Vectors) + for contents encryption. Without per-file keys, to ensure IV + uniqueness both the inode and logical block number would need to be + encoded in the IVs. This would make it impossible to renumber + inodes, which e.g. ``resize2fs`` can do when resizing an ext4 + filesystem. With per-file keys, it is sufficient to encode just the + logical block number in the IVs. +- Per-file keys strengthen the encryption of filenames, where IVs are + reused out of necessity. 
With a unique key per directory, IV reuse + is limited to within a single directory. +- Per-file keys allow individual files to be securely erased simply by + securely erasing their keys. (Not yet implemented.) + +A KDF (Key Derivation Function) is used to derive per-file keys from +the master key. This is done instead of wrapping a randomly-generated +key for each file because it reduces the size of the encryption xattr, +which for some filesystems makes the xattr more likely to fit in-line +in the filesystem's inode table. With a KDF, only a 16-byte nonce is +required --- long enough to make key reuse extremely unlikely. A +wrapped key, on the other hand, would need to be up to 64 bytes --- +the length of an AES-256-XTS key. Furthermore, currently there is no +requirement to support unlocking a file with multiple alternative +master keys or to support rotating master keys. Instead, the master +keys may be wrapped in userspace, e.g. as done by the `fscrypt +<https://github.com/google/fscrypt>`_ tool. + +The current KDF encrypts the master key using the 16-byte nonce as an +AES-128-ECB key. The output is used as the derived key. If the +output is longer than needed, then it is truncated to the needed +length. Truncation is the norm for directories and symlinks, since +those use the CTS-CBC encryption mode which requires a key half as +long as that required by the XTS encryption mode. + +Note: this KDF meets the primary security requirement, which is to +produce unique derived keys that preserve the entropy of the master +key, assuming that the master key is already a good pseudorandom key. +However, it is nonstandard and has some problems such as being +reversible, so it is generally considered to be a mistake! It may be +replaced with HKDF or another more standard KDF in the future. + +Encryption modes and usage +========================== + +fscrypt allows one encryption mode to be specified for file contents +and one encryption mode to be specified for filenames.
Different +directory trees are permitted to use different encryption modes. +Currently, the following pairs of encryption modes are supported: + +- AES-256-XTS for contents and AES-256-CTS-CBC for filenames +- AES-128-CBC for contents and AES-128-CTS-CBC for filenames +- Speck128/256-XTS for contents and Speck128/256-CTS-CBC for filenames + +It is strongly recommended to use AES-256-XTS for contents encryption. +AES-128-CBC was added only for low-powered embedded devices with +crypto accelerators such as CAAM or CESA that do not support XTS. + +Similarly, Speck128/256 support was only added for older or low-end +CPUs which cannot do AES fast enough -- especially ARM CPUs which have +NEON instructions but not the Cryptography Extensions -- and for which +it would not otherwise be feasible to use encryption at all. It is +not recommended to use Speck on CPUs that have AES instructions. +Speck support is only available if it has been enabled in the crypto +API via CONFIG_CRYPTO_SPECK. Also, on ARM platforms, to get +acceptable performance CONFIG_CRYPTO_SPECK_NEON must be enabled. + +New encryption modes can be added relatively easily, without changes +to individual filesystems. However, authenticated encryption (AE) +modes are not currently supported because of the difficulty of dealing +with ciphertext expansion. + +For file contents, each filesystem block is encrypted independently. +Currently, only the case where the filesystem block size is equal to +the system's page size (usually 4096 bytes) is supported. With the +XTS mode of operation (recommended), the logical block number within +the file is used as the IV. With the CBC mode of operation (not +recommended), ESSIV is used; specifically, the IV for CBC is the +logical block number encrypted with AES-256, where the AES-256 key is +the SHA-256 hash of the inode's data encryption key. + +For filenames, the full filename is encrypted at once. 
Because of the +requirements to retain support for efficient directory lookups and +filenames of up to 255 bytes, a constant initialization vector (IV) is +used. However, each encrypted directory uses a unique key, which +limits IV reuse to within a single directory. Note that IV reuse in +the context of CTS-CBC encryption means that when the original +filenames share a common prefix at least as long as the cipher block +size (16 bytes for AES), the corresponding encrypted filenames will +also share a common prefix. This is undesirable; it may be fixed in +the future by switching to an encryption mode that is a strong +pseudorandom permutation on arbitrary-length messages, e.g. the HEH +(Hash-Encrypt-Hash) mode. + +Since filenames are encrypted with the CTS-CBC mode of operation, the +plaintext and ciphertext filenames need not be multiples of the AES +block size, i.e. 16 bytes. However, the minimum size that can be +encrypted is 16 bytes, so shorter filenames are NUL-padded to 16 bytes +before being encrypted. In addition, to reduce leakage of filename +lengths via their ciphertexts, all filenames are NUL-padded to the +next 4, 8, 16, or 32-byte boundary (configurable). 32 is recommended +since this provides the best confidentiality, at the cost of making +directory entries consume slightly more space. Note that since NUL +(``\0``) is not otherwise a valid character in filenames, the padding +will never produce duplicate plaintexts. + +Symbolic link targets are considered a type of filename and are +encrypted in the same way as filenames in directory entries. Each +symlink also uses a unique key; hence, the hardcoded IV is not a +problem for symlinks. + +User API +======== + +Setting an encryption policy +---------------------------- + +The FS_IOC_SET_ENCRYPTION_POLICY ioctl sets an encryption policy on an +empty directory or verifies that a directory or regular file already +has the specified encryption policy. 
It takes in a pointer to a +:c:type:`struct fscrypt_policy`, defined as follows:: + + #define FS_KEY_DESCRIPTOR_SIZE 8 + + struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + }; + +This structure must be initialized as follows: + +- ``version`` must be 0. + +- ``contents_encryption_mode`` and ``filenames_encryption_mode`` must + be set to constants from ``<linux/fs.h>`` which identify the + encryption modes to use. If unsure, use + FS_ENCRYPTION_MODE_AES_256_XTS (1) for ``contents_encryption_mode`` + and FS_ENCRYPTION_MODE_AES_256_CTS (4) for + ``filenames_encryption_mode``. + +- ``flags`` must be set to a value from ``<linux/fs.h>`` which + identifies the amount of NUL-padding to use when encrypting + filenames. If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3). + +- ``master_key_descriptor`` specifies how to find the master key in + the keyring; see `Adding keys`_. It is up to userspace to choose a + unique ``master_key_descriptor`` for each master key. The e4crypt + and fscrypt tools use the first 8 bytes of + ``SHA-512(SHA-512(master_key))``, but this particular scheme is not + required. Also, the master key need not be in the keyring yet when + FS_IOC_SET_ENCRYPTION_POLICY is executed. However, it must be added + before any files can be created in the encrypted directory. + +If the file is not yet encrypted, then FS_IOC_SET_ENCRYPTION_POLICY +verifies that the file is an empty directory. If so, the specified +encryption policy is assigned to the directory, turning it into an +encrypted directory. After that, and after providing the +corresponding master key as described in `Adding keys`_, all regular +files, directories (recursively), and symlinks created in the +directory will be encrypted, inheriting the same encryption policy. +The filenames in the directory's entries will be encrypted as well.
+ +Alternatively, if the file is already encrypted, then +FS_IOC_SET_ENCRYPTION_POLICY validates that the specified encryption +policy exactly matches the actual one. If they match, then the ioctl +returns 0. Otherwise, it fails with EEXIST. This works on both +regular files and directories, including nonempty directories. + +Note that the ext4 filesystem does not allow the root directory to be +encrypted, even if it is empty. Users who want to encrypt an entire +filesystem with one key should consider using dm-crypt instead. + +FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors: + +- ``EACCES``: the file is not owned by the process's uid, nor does the + process have the CAP_FOWNER capability in a namespace with the file + owner's uid mapped +- ``EEXIST``: the file is already encrypted with an encryption policy + different from the one specified +- ``EINVAL``: an invalid encryption policy was specified (invalid + version, mode(s), or flags) +- ``ENOTDIR``: the file is unencrypted and is a regular file, not a + directory +- ``ENOTEMPTY``: the file is unencrypted and is a nonempty directory +- ``ENOTTY``: this type of filesystem does not implement encryption +- ``EOPNOTSUPP``: the kernel was not configured with encryption + support for this filesystem, or the filesystem superblock has not + had encryption enabled on it. (For example, to use encryption on an + ext4 filesystem, CONFIG_EXT4_ENCRYPTION must be enabled in the + kernel config, and the superblock must have had the "encrypt" + feature flag enabled using ``tune2fs -O encrypt`` or ``mkfs.ext4 -O + encrypt``.) +- ``EPERM``: this directory may not be encrypted, e.g. because it is + the root directory of an ext4 filesystem +- ``EROFS``: the filesystem is readonly + +Getting an encryption policy +---------------------------- + +The FS_IOC_GET_ENCRYPTION_POLICY ioctl retrieves the :c:type:`struct +fscrypt_policy`, if any, for a directory or regular file. See above +for the struct definition. 
No additional permissions are required +beyond the ability to open the file. + +FS_IOC_GET_ENCRYPTION_POLICY can fail with the following errors: + +- ``EINVAL``: the file is encrypted, but it uses an unrecognized + encryption context format +- ``ENODATA``: the file is not encrypted +- ``ENOTTY``: this type of filesystem does not implement encryption +- ``EOPNOTSUPP``: the kernel was not configured with encryption + support for this filesystem + +Note: if you only need to know whether a file is encrypted or not, on +most filesystems it is also possible to use the FS_IOC_GETFLAGS ioctl +and check for FS_ENCRYPT_FL, or to use the statx() system call and +check for STATX_ATTR_ENCRYPTED in stx_attributes. + +Getting the per-filesystem salt +------------------------------- + +Some filesystems, such as ext4 and F2FS, also support the deprecated +ioctl FS_IOC_GET_ENCRYPTION_PWSALT. This ioctl retrieves a randomly +generated 16-byte value stored in the filesystem superblock. This +value is intended to be used as a salt when deriving an encryption key +from a passphrase or other low-entropy user credential. + +FS_IOC_GET_ENCRYPTION_PWSALT is deprecated. Instead, prefer to +generate and manage any needed salt(s) in userspace. + +Adding keys +----------- + +To provide a master key, userspace must add it to an appropriate +keyring using the add_key() system call (see: +``Documentation/security/keys/core.rst``). The key type must be +"logon"; keys of this type are kept in kernel memory and cannot be +read back by userspace. The key description must be "fscrypt:" +followed by the 16-character lower case hex representation of the +``master_key_descriptor`` that was set in the encryption policy. The +key payload must conform to the following structure:: + + #define FS_MAX_KEY_SIZE 64 + + struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; + }; + +``mode`` is ignored; just set it to 0.
The actual key is provided in +``raw`` with ``size`` indicating its size in bytes. That is, the +bytes ``raw[0..size-1]`` (inclusive) are the actual key. + +The key description prefix "fscrypt:" may alternatively be replaced +with a filesystem-specific prefix such as "ext4:". However, the +filesystem-specific prefixes are deprecated and should not be used in +new programs. + +There are several different types of keyrings in which encryption keys +may be placed, such as a session keyring, a user session keyring, or a +user keyring. Each key must be placed in a keyring that is "attached" +to all processes that might need to access files encrypted with it, in +the sense that request_key() will find the key. Generally, if only +processes belonging to a specific user need to access a given +encrypted directory and no session keyring has been installed, then +that directory's key should be placed in that user's user session +keyring or user keyring. Otherwise, a session keyring should be +installed if needed, and the key should be linked into that session +keyring, or in a keyring linked into that session keyring. + +Note: introducing the complex visibility semantics of keyrings here +was arguably a mistake --- especially given that by design, after any +process successfully opens an encrypted file (thereby setting up the +per-file key), possessing the keyring key is not actually required for +any process to read/write the file until its in-memory inode is +evicted. In the future there probably should be a way to provide keys +directly to the filesystem instead, which would make the intended +semantics clearer. + +Access semantics +================ + +With the key +------------ + +With the encryption key, encrypted regular files, directories, and +symlinks behave very similarly to their unencrypted counterparts --- +after all, the encryption is intended to be transparent. 
However, +astute users may notice some differences in behavior: + +- Unencrypted files, or files encrypted with a different encryption + policy (i.e. different key, modes, or flags), cannot be renamed or + linked into an encrypted directory; see `Encryption policy + enforcement`_. Attempts to do so will fail with EPERM. However, + encrypted files can be renamed within an encrypted directory, or + into an unencrypted directory. + +- Direct I/O is not supported on encrypted files. Attempts to use + direct I/O on such files will fall back to buffered I/O. + +- The fallocate operations FALLOC_FL_COLLAPSE_RANGE, + FALLOC_FL_INSERT_RANGE, and FALLOC_FL_ZERO_RANGE are not supported + on encrypted files and will fail with EOPNOTSUPP. + +- Online defragmentation of encrypted files is not supported. The + EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with + EOPNOTSUPP. + +- The ext4 filesystem does not support data journaling with encrypted + regular files. It will fall back to ordered data mode instead. + +- DAX (Direct Access) is not supported on encrypted files. + +- The st_size of an encrypted symlink will not necessarily give the + length of the symlink target as required by POSIX. It will actually + give the length of the ciphertext, which will be slightly longer + than the plaintext due to NUL-padding and an extra 2-byte overhead. + +- The maximum length of an encrypted symlink is 2 bytes shorter than + the maximum length of an unencrypted symlink. For example, on an + EXT4 filesystem with a 4K block size, unencrypted symlinks can be up + to 4095 bytes long, while encrypted symlinks can only be up to 4093 + bytes long (both lengths excluding the terminating null). + +Note that mmap *is* supported. This is possible because the pagecache +for an encrypted file contains the plaintext, not the ciphertext. 
+ +Without the key +--------------- + +Some filesystem operations may be performed on encrypted regular +files, directories, and symlinks even before their encryption key has +been provided: + +- File metadata may be read, e.g. using stat(). + +- Directories may be listed, in which case the filenames will be + listed in an encoded form derived from their ciphertext. The + current encoding algorithm is described in `Filename hashing and + encoding`_. The algorithm is subject to change, but it is + guaranteed that the presented filenames will be no longer than + NAME_MAX bytes, will not contain the ``/`` or ``\0`` characters, and + will uniquely identify directory entries. + + The ``.`` and ``..`` directory entries are special. They are always + present and are not encrypted or encoded. + +- Files may be deleted. That is, nondirectory files may be deleted + with unlink() as usual, and empty directories may be deleted with + rmdir() as usual. Therefore, ``rm`` and ``rm -r`` will work as + expected. + +- Symlink targets may be read and followed, but they will be presented + in encrypted form, similar to filenames in directories. Hence, they + are unlikely to point to anywhere useful. + +Without the key, regular files cannot be opened or truncated. +Attempts to do so will fail with ENOKEY. This implies that any +regular file operations that require a file descriptor, such as +read(), write(), mmap(), fallocate(), and ioctl(), are also forbidden. + +Also without the key, files of any type (including directories) cannot +be created or linked into an encrypted directory, nor can a name in an +encrypted directory be the source or target of a rename, nor can an +O_TMPFILE temporary file be created in an encrypted directory. All +such operations will fail with ENOKEY. + +It is not currently possible to backup and restore encrypted files +without the encryption key. This would require special APIs which +have not yet been implemented. 
+ +Encryption policy enforcement +============================= + +After an encryption policy has been set on a directory, all regular +files, directories, and symbolic links created in that directory +(recursively) will inherit that encryption policy. Special files --- +that is, named pipes, device nodes, and UNIX domain sockets --- will +not be encrypted. + +Except for those special files, it is forbidden to have unencrypted +files, or files encrypted with a different encryption policy, in an +encrypted directory tree. Attempts to link or rename such a file into +an encrypted directory will fail with EPERM. This is also enforced +during ->lookup() to provide limited protection against offline +attacks that try to disable or downgrade encryption in known locations +where applications may later write sensitive data. It is recommended +that systems implementing a form of "verified boot" take advantage of +this by validating all top-level encryption policies prior to access. + +Implementation details +====================== + +Encryption context +------------------ + +An encryption policy is represented on-disk by a :c:type:`struct +fscrypt_context`. It is up to individual filesystems to decide where +to store it, but normally it would be stored in a hidden extended +attribute. It should *not* be exposed by the xattr-related system +calls such as getxattr() and setxattr() because of the special +semantics of the encryption xattr. (In particular, there would be +much confusion if an encryption policy were to be added to or removed +from anything other than an empty directory.) 
The struct is defined +as follows:: + + #define FS_KEY_DESCRIPTOR_SIZE 8 + #define FS_KEY_DERIVATION_NONCE_SIZE 16 + + struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + }; + +Note that :c:type:`struct fscrypt_context` contains the same +information as :c:type:`struct fscrypt_policy` (see `Setting an +encryption policy`_), except that :c:type:`struct fscrypt_context` +also contains a nonce. The nonce is randomly generated by the kernel +and is used to derive the inode's encryption key as described in +`Per-file keys`_. + +Data path changes +----------------- + +For the read path (->readpage()) of regular files, filesystems can +read the ciphertext into the page cache and decrypt it in-place. The +page lock must be held until decryption has finished, to prevent the +page from becoming visible to userspace prematurely. + +For the write path (->writepage()) of regular files, filesystems +cannot encrypt data in-place in the page cache, since the cached +plaintext must be preserved. Instead, filesystems must encrypt into a +temporary buffer or "bounce page", then write out the temporary +buffer. Some filesystems, such as UBIFS, already use temporary +buffers regardless of encryption. Other filesystems, such as ext4 and +F2FS, have to allocate bounce pages specially for encryption. + +Filename hashing and encoding +----------------------------- + +Modern filesystems accelerate directory lookups by using indexed +directories. An indexed directory is organized as a tree keyed by +filename hashes. When a ->lookup() is requested, the filesystem +normally hashes the filename being looked up so that it can quickly +find the corresponding directory entry, if any. + +With encryption, lookups must be supported and efficient both with and +without the encryption key. 
Clearly, it would not work to hash the +plaintext filenames, since the plaintext filenames are unavailable +without the key. (Hashing the plaintext filenames would also make it +impossible for the filesystem's fsck tool to optimize encrypted +directories.) Instead, filesystems hash the ciphertext filenames, +i.e. the bytes actually stored on-disk in the directory entries. When +asked to do a ->lookup() with the key, the filesystem just encrypts +the user-supplied name to get the ciphertext. + +Lookups without the key are more complicated. The raw ciphertext may +contain the ``\0`` and ``/`` characters, which are illegal in +filenames. Therefore, readdir() must base64-encode the ciphertext for +presentation. For most filenames, this works fine; on ->lookup(), the +filesystem just base64-decodes the user-supplied name to get back to +the raw ciphertext. + +However, for very long filenames, base64 encoding would cause the +filename length to exceed NAME_MAX. To prevent this, readdir() +actually presents long filenames in an abbreviated form which encodes +a strong "hash" of the ciphertext filename, along with the optional +filesystem-specific hash(es) needed for directory lookups. This +allows the filesystem to still, with a high degree of confidence, map +the filename given in ->lookup() back to a particular directory entry +that was previously listed by readdir(). See :c:type:`struct +fscrypt_digested_name` in the source for more details. + +Note that the precise way that filenames are presented to userspace +without the key is subject to change in the future. It is only meant +as a way to temporarily present valid filenames so that commands like +``rm -r`` work as expected on encrypted directories. 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 92c6c0ace1b1..ea372cd53ab6 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -93,6 +93,10 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode, filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) return true; + if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS) + return true; + return false; } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 44bcb695c206..154cd89c2212 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -155,6 +155,8 @@ static const struct { [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", 32 }, [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", 16 }, [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", 16 }, + [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { "xts(speck128)", 64 }, + [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { "cts(cbc(speck128))", 32 }, }; static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f3ef5016cf9c..52cedebfd202 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -191,6 +191,8 @@ struct inodes_stat_t { #define FS_ENCRYPTION_MODE_AES_256_CTS 4 #define FS_ENCRYPTION_MODE_AES_128_CBC 5 #define FS_ENCRYPTION_MODE_AES_128_CTS 6 +#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 +#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 struct fscrypt_policy { From a0ca4bdf47449c111a0225f49b644bf5e1fc72bd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 21 Jan 2016 17:10:56 +0800 Subject: [PATCH 0693/1212] crypto: skcipher - Add default key size helper While converting ecryptfs over to skcipher I found that it needs to pick a default key size if one isn't given. 
Rather than having it poke into the guts of the algorithm to get max_keysize, let's provide a helper that is meant to give a sane default (just in case we ever get an algorithm that has no maximum key size). Signed-off-by: Herbert Xu --- crypto/skcipher.c | 4 ++-- include/crypto/skcipher.h | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index d199c0b1751c..69230e9d4ac9 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -118,7 +118,7 @@ static int crypto_init_skcipher_ops_blkcipher(struct crypto_tfm *tfm) skcipher->decrypt = skcipher_decrypt_blkcipher; skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher); - skcipher->has_setkey = calg->cra_blkcipher.max_keysize; + skcipher->keysize = calg->cra_blkcipher.max_keysize; return 0; } @@ -211,7 +211,7 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) skcipher->ivsize = crypto_ablkcipher_ivsize(ablkcipher); skcipher->reqsize = crypto_ablkcipher_reqsize(ablkcipher) + sizeof(struct ablkcipher_request); - skcipher->has_setkey = calg->cra_ablkcipher.max_keysize; + skcipher->keysize = calg->cra_ablkcipher.max_keysize; return 0; } diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index fd8742a40ff3..2f07b4fce3e0 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -60,8 +60,7 @@ struct crypto_skcipher { unsigned int ivsize; unsigned int reqsize; - - bool has_setkey; + unsigned int keysize; struct crypto_tfm base; }; @@ -309,7 +308,13 @@ static inline int crypto_skcipher_setkey(struct crypto_skcipher *tfm, static inline bool crypto_skcipher_has_setkey(struct crypto_skcipher *tfm) { - return tfm->has_setkey; + return tfm->keysize; +} + +static inline unsigned int crypto_skcipher_default_keysize( + struct crypto_skcipher *tfm) +{ + return tfm->keysize; } /** From a9146e42354783b81999191970349a9e5a9d1c98 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 Jan 2016 22:14:36 +0800 Subject: 
[PATCH 0694/1212] crypto: skcipher - Add helper to retrieve driver name This patch adds the helper crypto_skcipher_driver_name which returns the driver name of the alg object for a given tfm. This is needed by ecryptfs. Signed-off-by: Herbert Xu --- include/crypto/skcipher.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 2f07b4fce3e0..41418790c536 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -231,6 +231,12 @@ static inline int crypto_has_skcipher(const char *alg_name, u32 type, crypto_skcipher_mask(mask)); } +static inline const char *crypto_skcipher_driver_name( + struct crypto_skcipher *tfm) +{ + return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)); +} + /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle From b24dcaae875314079dd4fe65ce231fd9b0bf58be Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 12 Jul 2016 13:17:31 +0800 Subject: [PATCH 0695/1212] crypto: skcipher - Add low-level skcipher interface This patch allows skcipher algorithms and instances to be created and registered with the crypto API. They are accessible through the top-level skcipher interface, along with ablkcipher/blkcipher algorithms and instances. This patch also introduces a new parameter called chunk size which is meant for ciphers such as CTR and CTS which ostensibly can handle arbitrary lengths, but still behave like block ciphers in that you can only process a partial block at the very end. For these ciphers the block size will continue to be set to 1 as it is now while the chunk size will be set to the underlying block size.
Signed-off-by: Herbert Xu --- crypto/skcipher.c | 196 +++++++++++++++++++++++++++-- include/crypto/internal/skcipher.h | 87 +++++++++++++ include/crypto/skcipher.h | 130 +++++++++++++++++++ include/linux/crypto.h | 1 + 4 files changed, 407 insertions(+), 7 deletions(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 69230e9d4ac9..d248008e7f7b 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -16,7 +16,11 @@ #include #include +#include #include +#include +#include +#include #include "internal.h" @@ -25,10 +29,11 @@ static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg) if (alg->cra_type == &crypto_blkcipher_type) return sizeof(struct crypto_blkcipher *); - BUG_ON(alg->cra_type != &crypto_ablkcipher_type && - alg->cra_type != &crypto_givcipher_type); + if (alg->cra_type == &crypto_ablkcipher_type || + alg->cra_type == &crypto_givcipher_type) + return sizeof(struct crypto_ablkcipher *); - return sizeof(struct crypto_ablkcipher *); + return crypto_alg_extsize(alg); } static int skcipher_setkey_blkcipher(struct crypto_skcipher *tfm, @@ -216,26 +221,118 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) return 0; } +static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); + struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); + + alg->exit(skcipher); +} + static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) { + struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); + struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); + if (tfm->__crt_alg->cra_type == &crypto_blkcipher_type) return crypto_init_skcipher_ops_blkcipher(tfm); - BUG_ON(tfm->__crt_alg->cra_type != &crypto_ablkcipher_type && - tfm->__crt_alg->cra_type != &crypto_givcipher_type); + if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type || + tfm->__crt_alg->cra_type == &crypto_givcipher_type) + return crypto_init_skcipher_ops_ablkcipher(tfm); - return 
crypto_init_skcipher_ops_ablkcipher(tfm); + skcipher->setkey = alg->setkey; + skcipher->encrypt = alg->encrypt; + skcipher->decrypt = alg->decrypt; + skcipher->ivsize = alg->ivsize; + skcipher->keysize = alg->max_keysize; + + if (alg->exit) + skcipher->base.exit = crypto_skcipher_exit_tfm; + + if (alg->init) + return alg->init(skcipher); + + return 0; } +static void crypto_skcipher_free_instance(struct crypto_instance *inst) +{ + struct skcipher_instance *skcipher = + container_of(inst, struct skcipher_instance, s.base); + + skcipher->free(skcipher); +} + +static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) + __attribute__ ((unused)); +static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) +{ + struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg, + base); + + seq_printf(m, "type : skcipher\n"); + seq_printf(m, "async : %s\n", + alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); + seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); + seq_printf(m, "min keysize : %u\n", skcipher->min_keysize); + seq_printf(m, "max keysize : %u\n", skcipher->max_keysize); + seq_printf(m, "ivsize : %u\n", skcipher->ivsize); + seq_printf(m, "chunksize : %u\n", skcipher->chunksize); +} + +#ifdef CONFIG_NET +static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg) +{ + struct crypto_report_blkcipher rblkcipher; + struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg, + base); + + strncpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); + strncpy(rblkcipher.geniv, "", sizeof(rblkcipher.geniv)); + + rblkcipher.blocksize = alg->cra_blocksize; + rblkcipher.min_keysize = skcipher->min_keysize; + rblkcipher.max_keysize = skcipher->max_keysize; + rblkcipher.ivsize = skcipher->ivsize; + + if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, + sizeof(struct crypto_report_blkcipher), &rblkcipher)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} +#else +static int 
crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg) +{ + return -ENOSYS; +} +#endif + static const struct crypto_type crypto_skcipher_type2 = { .extsize = crypto_skcipher_extsize, .init_tfm = crypto_skcipher_init_tfm, + .free = crypto_skcipher_free_instance, +#ifdef CONFIG_PROC_FS + .show = crypto_skcipher_show, +#endif + .report = crypto_skcipher_report, .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_BLKCIPHER_MASK, - .type = CRYPTO_ALG_TYPE_BLKCIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .tfmsize = offsetof(struct crypto_skcipher, base), }; +int crypto_grab_skcipher2(struct crypto_skcipher_spawn *spawn, + const char *name, u32 type, u32 mask) +{ + spawn->base.frontend = &crypto_skcipher_type2; + return crypto_grab_spawn(&spawn->base, name, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_grab_skcipher2); + struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { @@ -243,5 +340,90 @@ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, } EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); +int crypto_has_skcipher2(const char *alg_name, u32 type, u32 mask) +{ + return crypto_type_has_alg(alg_name, &crypto_skcipher_type2, + type, mask); +} +EXPORT_SYMBOL_GPL(crypto_has_skcipher2); + +static int skcipher_prepare_alg(struct skcipher_alg *alg) +{ + struct crypto_alg *base = &alg->base; + + if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8) + return -EINVAL; + + if (!alg->chunksize) + alg->chunksize = base->cra_blocksize; + + base->cra_type = &crypto_skcipher_type2; + base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; + base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER; + + return 0; +} + +int crypto_register_skcipher(struct skcipher_alg *alg) +{ + struct crypto_alg *base = &alg->base; + int err; + + err = skcipher_prepare_alg(alg); + if (err) + return err; + + return crypto_register_alg(base); +} +EXPORT_SYMBOL_GPL(crypto_register_skcipher); + +void crypto_unregister_skcipher(struct skcipher_alg *alg) +{ + 
crypto_unregister_alg(&alg->base); +} +EXPORT_SYMBOL_GPL(crypto_unregister_skcipher); + +int crypto_register_skciphers(struct skcipher_alg *algs, int count) +{ + int i, ret; + + for (i = 0; i < count; i++) { + ret = crypto_register_skcipher(&algs[i]); + if (ret) + goto err; + } + + return 0; + +err: + for (--i; i >= 0; --i) + crypto_unregister_skcipher(&algs[i]); + + return ret; +} +EXPORT_SYMBOL_GPL(crypto_register_skciphers); + +void crypto_unregister_skciphers(struct skcipher_alg *algs, int count) +{ + int i; + + for (i = count - 1; i >= 0; --i) + crypto_unregister_skcipher(&algs[i]); +} +EXPORT_SYMBOL_GPL(crypto_unregister_skciphers); + +int skcipher_register_instance(struct crypto_template *tmpl, + struct skcipher_instance *inst) +{ + int err; + + err = skcipher_prepare_alg(&inst->alg); + if (err) + return err; + + return crypto_register_instance(tmpl, skcipher_crypto_instance(inst)); +} +EXPORT_SYMBOL_GPL(skcipher_register_instance); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Symmetric key cipher type"); diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 2cf7a61ece59..ce6619c339fe 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -19,12 +19,46 @@ struct rtattr; +struct skcipher_instance { + void (*free)(struct skcipher_instance *inst); + union { + struct { + char head[offsetof(struct skcipher_alg, base)]; + struct crypto_instance base; + } s; + struct skcipher_alg alg; + }; +}; + struct crypto_skcipher_spawn { struct crypto_spawn base; }; extern const struct crypto_type crypto_givcipher_type; +static inline struct crypto_instance *skcipher_crypto_instance( + struct skcipher_instance *inst) +{ + return &inst->s.base; +} + +static inline struct skcipher_instance *skcipher_alg_instance( + struct crypto_skcipher *skcipher) +{ + return container_of(crypto_skcipher_alg(skcipher), + struct skcipher_instance, alg); +} + +static inline void *skcipher_instance_ctx(struct 
skcipher_instance *inst) +{ + return crypto_instance_ctx(skcipher_crypto_instance(inst)); +} + +static inline void skcipher_request_complete(struct skcipher_request *req, int err) +{ + req->base.complete(&req->base, err); +} + static inline void crypto_set_skcipher_spawn( struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst) { @@ -33,6 +67,8 @@ static inline void crypto_set_skcipher_spawn( int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, const char *name, u32 type, u32 mask); +int crypto_grab_skcipher2(struct crypto_skcipher_spawn *spawn, + const char *name, u32 type, u32 mask); struct crypto_alg *crypto_lookup_skcipher(const char *name, u32 type, u32 mask); @@ -47,6 +83,12 @@ static inline struct crypto_alg *crypto_skcipher_spawn_alg( return spawn->base.alg; } +static inline struct skcipher_alg *crypto_spawn_skcipher_alg( + struct crypto_skcipher_spawn *spawn) +{ + return container_of(spawn->base.alg, struct skcipher_alg, base); +} + static inline struct crypto_ablkcipher *crypto_spawn_skcipher( struct crypto_skcipher_spawn *spawn) { @@ -55,6 +97,25 @@ static inline struct crypto_ablkcipher *crypto_spawn_skcipher( crypto_skcipher_mask(0))); } +static inline struct crypto_skcipher *crypto_spawn_skcipher2( + struct crypto_skcipher_spawn *spawn) +{ + return crypto_spawn_tfm2(&spawn->base); +} + +static inline void crypto_skcipher_set_reqsize( + struct crypto_skcipher *skcipher, unsigned int reqsize) +{ + skcipher->reqsize = reqsize; +} + +int crypto_register_skcipher(struct skcipher_alg *alg); +void crypto_unregister_skcipher(struct skcipher_alg *alg); +int crypto_register_skciphers(struct skcipher_alg *algs, int count); +void crypto_unregister_skciphers(struct skcipher_alg *algs, int count); +int skcipher_register_instance(struct crypto_template *tmpl, + struct skcipher_instance *inst); + int skcipher_null_givencrypt(struct skcipher_givcrypt_request *req); int skcipher_null_givdecrypt(struct skcipher_givcrypt_request *req); const char 
*crypto_default_geniv(const struct crypto_alg *alg); @@ -122,5 +183,31 @@ static inline u32 skcipher_request_flags(struct skcipher_request *req) return req->base.flags; } +static inline unsigned int crypto_skcipher_alg_min_keysize( + struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blkcipher.min_keysize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_ablkcipher.min_keysize; + + return alg->min_keysize; +} + +static inline unsigned int crypto_skcipher_alg_max_keysize( + struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blkcipher.max_keysize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_ablkcipher.max_keysize; + + return alg->max_keysize; +} + #endif /* _CRYPTO_INTERNAL_SKCIPHER_H */ diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 41418790c536..5c90d3edf975 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -65,6 +65,75 @@ struct crypto_skcipher { struct crypto_tfm base; }; +/** + * struct skcipher_alg - symmetric key cipher definition + * @min_keysize: Minimum key size supported by the transformation. This is the + * smallest key length supported by this transformation algorithm. + * This must be set to one of the pre-defined values as this is + * not hardware specific. Possible values for this field can be + * found via git grep "_MIN_KEY_SIZE" include/crypto/ + * @max_keysize: Maximum key size supported by the transformation. This is the + * largest key length supported by this transformation algorithm. + * This must be set to one of the pre-defined values as this is + * not hardware specific. Possible values for this field can be + * found via git grep "_MAX_KEY_SIZE" include/crypto/ + * @setkey: Set key for the transformation. 
This function is used to either + * program a supplied key into the hardware or store the key in the + * transformation context for programming it later. Note that this + * function does modify the transformation context. This function can + * be called multiple times during the existence of the transformation + * object, so one must make sure the key is properly reprogrammed into + * the hardware. This function is also responsible for checking the key + * length for validity. In case a software fallback was put in place in + * the @cra_init call, this function might need to use the fallback if + * the algorithm doesn't support all of the key sizes. + * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt + * the supplied scatterlist containing the blocks of data. The crypto + * API consumer is responsible for aligning the entries of the + * scatterlist properly and making sure the chunks are correctly + * sized. In case a software fallback was put in place in the + * @cra_init call, this function might need to use the fallback if + * the algorithm doesn't support all of the key sizes. In case the + * key was stored in transformation context, the key might need to be + * re-programmed into the hardware in this function. This function + * shall not modify the transformation context, as this function may + * be called in parallel with the same transformation object. + * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt + * and the conditions are exactly the same. + * @init: Initialize the cryptographic transformation object. This function + * is used to initialize the cryptographic transformation object. + * This function is called only once at the instantiation time, right + * after the transformation context was allocated. 
In case the + * cryptographic hardware has some special requirements which need to + * be handled by software, this function shall check for the precise + * requirement of the transformation and put any software fallbacks + * in place. + * @exit: Deinitialize the cryptographic transformation object. This is a + * counterpart to @init, used to remove various changes set in + * @init. + * @ivsize: IV size applicable for transformation. The consumer must provide an + * IV of exactly that size to perform the encrypt or decrypt operation. + * @chunksize: Equal to the block size except for stream ciphers such as + * CTR where it is set to the underlying block size. + * + * All fields except @ivsize are mandatory and must be filled. + */ +struct skcipher_alg { + int (*setkey)(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen); + int (*encrypt)(struct skcipher_request *req); + int (*decrypt)(struct skcipher_request *req); + int (*init)(struct crypto_skcipher *tfm); + void (*exit)(struct crypto_skcipher *tfm); + + unsigned int min_keysize; + unsigned int max_keysize; + unsigned int ivsize; + unsigned int chunksize; + + struct crypto_alg base; +}; + #define SKCIPHER_REQUEST_ON_STACK(name, tfm) \ char __##name##_desc[sizeof(struct skcipher_request) + \ crypto_skcipher_reqsize(tfm)] CRYPTO_MINALIGN_ATTR; \ @@ -231,12 +300,43 @@ static inline int crypto_has_skcipher(const char *alg_name, u32 type, crypto_skcipher_mask(mask)); } +/** + * crypto_has_skcipher2() - Search for the availability of an skcipher. 
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the + * skcipher + * @type: specifies the type of the skcipher + * @mask: specifies the mask for the skcipher + * + * Return: true when the skcipher is known to the kernel crypto API; false + * otherwise + */ +int crypto_has_skcipher2(const char *alg_name, u32 type, u32 mask); + static inline const char *crypto_skcipher_driver_name( struct crypto_skcipher *tfm) { return crypto_tfm_alg_name(crypto_skcipher_tfm(tfm)); } +static inline struct skcipher_alg *crypto_skcipher_alg( + struct crypto_skcipher *tfm) +{ + return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, + struct skcipher_alg, base); +} + +static inline unsigned int crypto_skcipher_alg_ivsize(struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blkcipher.ivsize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_ablkcipher.ivsize; + + return alg->ivsize; +} + /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle @@ -251,6 +351,36 @@ static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm) return tfm->ivsize; } +static inline unsigned int crypto_skcipher_alg_chunksize( + struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blocksize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_blocksize; + + return alg->chunksize; +} + +/** + * crypto_skcipher_chunksize() - obtain chunk size + * @tfm: cipher handle + * + * The block size is set to one for ciphers such as CTR. However, + * you still need to provide incremental updates in multiples of + * the underlying block size as the IV does not have sub-block + * granularity. This is known in this API as the chunk size. 
+ * + * Return: chunk size in bytes + */ +static inline unsigned int crypto_skcipher_chunksize( + struct crypto_skcipher *tfm) +{ + return crypto_skcipher_alg_chunksize(crypto_skcipher_alg(tfm)); +} + /** * crypto_skcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b7c1e1a7ebac..d7c8b37b2e95 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -48,6 +48,7 @@ #define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 +#define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006 #define CRYPTO_ALG_TYPE_DIGEST 0x00000008 #define CRYPTO_ALG_TYPE_HASH 0x00000008 From 4cbda579cd3d67e4f2097bd790ffcd28eef40c7b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 23 Jan 2016 13:51:01 +0800 Subject: [PATCH 0696/1212] crypto: api - Add crypto_type_has_alg helper This patch adds the helper crypto_type_has_alg which is meant to replace crypto_has_alg for new-style crypto types. Rather than hard-coding type/mask information they're now retrieved from the crypto_type object. 
Signed-off-by: Herbert Xu --- crypto/algapi.c | 15 +++++++++++++++ crypto/internal.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/crypto/algapi.c b/crypto/algapi.c index 59bf491fe3d8..c63f06a8b76a 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -988,6 +988,21 @@ unsigned int crypto_alg_extsize(struct crypto_alg *alg) } EXPORT_SYMBOL_GPL(crypto_alg_extsize); +int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, + u32 type, u32 mask) +{ + int ret = 0; + struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask); + + if (!IS_ERR(alg)) { + crypto_mod_put(alg); + ret = 1; + } + + return ret; +} +EXPORT_SYMBOL_GPL(crypto_type_has_alg); + static int __init crypto_algapi_init(void) { crypto_init_proc(); diff --git a/crypto/internal.h b/crypto/internal.h index 00e42a3ed814..7eefcdb00227 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -104,6 +104,9 @@ int crypto_probing_notify(unsigned long val, void *v); unsigned int crypto_alg_extsize(struct crypto_alg *alg); +int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, + u32 type, u32 mask); + static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) { atomic_inc(&alg->cra_refcnt); From e7724207f71e4bb50b1a34e234f22247c721b246 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 May 2018 10:58:14 -0700 Subject: [PATCH 0697/1212] fscrypt: log the crypto algorithm implementations Log the crypto algorithm driver name for each fscrypt encryption mode on its first use, also showing a friendly name for the mode. This will help people determine whether the expected implementations are being used. In some cases we've seen people do benchmarks and reject using encryption for performance reasons, when in fact they used a much slower implementation of AES-XTS than was possible on the hardware. It can make an enormous difference; e.g., AES-XTS on ARM is about 10x faster with the crypto extensions (AES instructions) than without. 
This also makes it more obvious which modes are being used, now that fscrypt supports multiple combinations of modes. Example messages (with default modes, on x86_64): [ 35.492057] fscrypt: AES-256-CTS-CBC using implementation "cts(cbc-aes-aesni)" [ 35.492171] fscrypt: AES-256-XTS using implementation "xts-aes-aesni" Note: algorithms can be dynamically added to the crypto API, which can result in different implementations being used at different times. But this is rare; for most users, showing the first will be good enough. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 102 +++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 154cd89c2212..382e828f2f9a 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -147,44 +147,64 @@ static int find_and_derive_key(const struct inode *inode, return err; } -static const struct { +static struct fscrypt_mode { + const char *friendly_name; const char *cipher_str; int keysize; + bool logged_impl_name; } available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", 64 }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", 32 }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", 16 }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", 16 }, - [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { "xts(speck128)", 64 }, - [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { "cts(cbc(speck128))", 32 }, + [FS_ENCRYPTION_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 
16, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { + .friendly_name = "Speck128/256-XTS", + .cipher_str = "xts(speck128)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { + .friendly_name = "Speck128/256-CTS-CBC", + .cipher_str = "cts(cbc(speck128))", + .keysize = 32, + }, }; -static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, - const char **cipher_str_ret, int *keysize_ret) +static struct fscrypt_mode * +select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode) { - u32 mode; - if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { fscrypt_warn(inode->i_sb, "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", inode->i_ino, ci->ci_data_mode, ci->ci_filename_mode); - return -EINVAL; + return ERR_PTR(-EINVAL); } - if (S_ISREG(inode->i_mode)) { - mode = ci->ci_data_mode; - } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - mode = ci->ci_filename_mode; - } else { - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", - inode->i_ino, (inode->i_mode & S_IFMT)); - return -EINVAL; - } + if (S_ISREG(inode->i_mode)) + return &available_modes[ci->ci_data_mode]; - *cipher_str_ret = available_modes[mode].cipher_str; - *keysize_ret = available_modes[mode].keysize; - return 0; + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[ci->ci_filename_mode]; + + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); } static void put_crypt_info(struct fscrypt_info *ci) @@ -269,8 +289,7 @@ int fscrypt_get_encryption_info(struct inode *inode) struct fscrypt_info *crypt_info; struct fscrypt_context ctx; struct crypto_skcipher *ctfm; - const char *cipher_str; - int keysize; + struct fscrypt_mode *mode; u8 *raw_key = NULL; int 
res; @@ -314,40 +333,55 @@ int fscrypt_get_encryption_info(struct inode *inode) memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); - if (res) + mode = select_encryption_mode(crypt_info, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); goto out; + } /* * This cannot be a stack buffer because it is passed to the scatterlist * crypto API as part of key derivation. */ res = -ENOMEM; - raw_key = kmalloc(keysize, GFP_NOFS); + raw_key = kmalloc(mode->keysize, GFP_NOFS); if (!raw_key) goto out; - res = find_and_derive_key(inode, &ctx, raw_key, keysize); + res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize); if (res) goto out; - ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); + ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); if (IS_ERR(ctfm)) { res = PTR_ERR(ctfm); fscrypt_warn(inode->i_sb, "error allocating '%s' transform for inode %lu: %d", - cipher_str, inode->i_ino, res); + mode->cipher_str, inode->i_ino, res); goto out; } + if (unlikely(!mode->logged_impl_name)) { + /* + * fscrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. 
+ */ + mode->logged_impl_name = true; + pr_info("fscrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(ctfm)->base.cra_driver_name); + } crypt_info->ci_ctfm = ctfm; crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, keysize); + res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize); if (res) goto out; if (S_ISREG(inode->i_mode) && crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { - res = init_essiv_generator(crypt_info, raw_key, keysize); + res = init_essiv_generator(crypt_info, raw_key, mode->keysize); if (res) { fscrypt_warn(inode->i_sb, "error initializing ESSIV generator for inode %lu: %d", From 71aaced0e1eea5f15b6aba888d9ded4eb29f8c9b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 3 Apr 2018 15:08:17 +0800 Subject: [PATCH 0698/1212] f2fs: introduce private inode status mapping Previously, we used the generic FS_*_FL flags defined by vfs to indicate inode status for each bit of i_flags, so f2fs's flag status definition is tied to vfs' one, and it is hard for f2fs to reuse bits f2fs never used to indicate new status. In order to solve this issue, we introduce a private inode status mapping. Note: for the bits that have already been persisted to disk, we should never change their definition; for the other ones, we can remap them for later new coming status. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++--- fs/f2fs/file.c | 19 +++++++++-------- fs/f2fs/inode.c | 12 +++++------ fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 4 ++-- 5 files changed, 75 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 51e4a9499f04..66c315a8ef78 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2219,9 +2219,60 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) -#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) +/* + * Inode flags + */ +#define F2FS_SECRM_FL 0x00000001 /* Secure deletion */ +#define F2FS_UNRM_FL 0x00000002 /* Undelete */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ +#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ +#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... 
*/ +#define F2FS_DIRTY_FL 0x00000100 +#define F2FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define F2FS_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define F2FS_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define F2FS_IMAGIC_FL 0x00002000 /* AFS directory */ +#define F2FS_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define F2FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define F2FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define F2FS_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ +#define F2FS_FL_XFLAG_VISIBLE (F2FS_SYNC_FL | \ + F2FS_IMMUTABLE_FL | \ + F2FS_APPEND_FL | \ + F2FS_NODUMP_FL | \ + F2FS_NOATIME_FL | \ + F2FS_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define F2FS_FL_INHERITED (F2FS_SECRM_FL | F2FS_UNRM_FL | F2FS_COMPR_FL |\ + F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL |\ + F2FS_NOCOMPR_FL | F2FS_JOURNAL_DATA_FL |\ + F2FS_NOTAIL_FL | F2FS_DIRSYNC_FL |\ + F2FS_PROJINHERIT_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 40d03d58b390..fc7d07f93bbe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -689,16 +689,16 @@ int f2fs_getattr(struct vfsmount *mnt, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } - flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); - if (flags & FS_APPEND_FL) + flags = fi->i_flags & (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); + if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; - if (flags & FS_COMPR_FL) + if (flags & F2FS_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (f2fs_encrypted_inode(inode)) stat->attributes |= STATX_ATTR_ENCRYPTED; - if (flags & FS_IMMUTABLE_FL) + if (flags & F2FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; - if (flags & FS_NODUMP_FL) + if (flags & F2FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | @@ -1590,7 +1590,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags = fi->i_flags & + (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); return put_user(flags, (int __user *)arg); } @@ -1624,15 +1625,15 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) oldflags = fi->i_flags; - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if ((flags ^ oldflags) & (F2FS_APPEND_FL | F2FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { ret = -EPERM; goto unlock_out; } } - flags = flags & FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; + flags = flags & (F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); + flags |= 
oldflags & ~(F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); fi->i_flags = flags; inode->i_ctime = current_time(inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 51846fc54fbd..2056211379f9 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -36,15 +36,15 @@ void f2fs_set_inode_flags(struct inode *inode) unsigned int flags = F2FS_I(inode)->i_flags; unsigned int new_fl = 0; - if (flags & FS_SYNC_FL) + if (flags & F2FS_SYNC_FL) new_fl |= S_SYNC; - if (flags & FS_APPEND_FL) + if (flags & F2FS_APPEND_FL) new_fl |= S_APPEND; - if (flags & FS_IMMUTABLE_FL) + if (flags & F2FS_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; - if (flags & FS_NOATIME_FL) + if (flags & F2FS_NOATIME_FL) new_fl |= S_NOATIME; - if (flags & FS_DIRSYNC_FL) + if (flags & F2FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; if (f2fs_encrypted_inode(inode)) new_fl |= S_ENCRYPTED; @@ -268,7 +268,7 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; - if (fi->i_flags & FS_PROJINHERIT_FL) + if (fi->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index fecae8685d2a..dd77ecbd536d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -61,7 +61,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } if (f2fs_sb_has_project_quota(sbi->sb) && - (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, @@ -116,9 +116,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); if (S_ISDIR(inode->i_mode)) - F2FS_I(inode)->i_flags |= FS_INDEX_FL; + F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; - if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + if (F2FS_I(inode)->i_flags & 
F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); trace_f2fs_new_inode(inode, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 777ed4eafa6c..b6ce10f8128a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1805,7 +1805,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, inode = d_inode(path->dentry); inode_lock(inode); - F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); @@ -1829,7 +1829,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) goto out_put; inode_lock(inode); - F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); From ec034d0f14ca093cf656843fa097350875c3895d Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 3 Apr 2018 19:42:41 +0800 Subject: [PATCH 0699/1212] f2fs: remove unmatched zero_user_segment when convert inline dentry Since the layout of regular dentry block is different from inline dentry block, zero_user_segment starting from MAX_INLINE_DATA(dir) is not correct for regular dentry block, besides, bitmap is already copied and used, so there is no necessary to zero page at all, so just remove the zero_user_segment is OK. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2ff0305391cd..85371b0971d9 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -368,7 +368,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = page_address(page); From cd79eb2b5e451ca0be15338684252aef56dd319d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Apr 2018 17:35:13 +0800 Subject: [PATCH 0700/1212] f2fs: remove redundant block plug For buffered IO, we don't need to use a block plug to cache bios; for direct IO, the generic f2fs_direct_IO has already added a block plug, so let's remove the redundant one in .write_iter. As Yunlei described in his patch: -f2fs_file_write_iter -blk_start_plug -__generic_file_write_iter ... -do_blockdev_direct_IO -blk_start_plug ... -blk_finish_plug ... -blk_finish_plug which may cause a performance decrease on our platform Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fc7d07f93bbe..b2db8349c97b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2695,7 +2695,6 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct blk_plug plug; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) @@ -2740,9 +2739,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; } } - blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); /* if we couldn't write data, we should deallocate blocks. 
*/ From fdf61219dc2512cd29b8b03a460a51af8ddca876 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 4 Apr 2018 17:29:05 +0800 Subject: [PATCH 0701/1212] f2fs: issue all big range discards in umount process This patch modifies max_requests to UINT_MAX, to issue all big range discards in umount. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a02d5c1a7ed2..3c2e44f76ff3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1024,6 +1024,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; } } From 298032d4d4a6dc6da4f7298da0200ef56e93006d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Apr 2018 20:25:06 +0800 Subject: [PATCH 0702/1212] f2fs: don't use GFP_ZERO for page caches Related to https://lkml.org/lkml/2018/4/8/661 Sometimes, we need to write metadata to a newly allocated block address, then we will allocate a zeroed page in the inner inode's address space, fill partial data in it, and leave the other places with zero value, which means some fields are in their initial status. There are two inner inodes (meta inode and node inode) setting __GFP_ZERO, I have just checked them, for both of them, we can avoid using __GFP_ZERO, and do initialization by ourselves to avoid unneeded/redundant zeroing from mm. 
Cc: Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/inode.c | 4 ++-- fs/f2fs/segment.c | 3 +++ fs/f2fs/segment.h | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 760d1ad22722..0bdd5bdfeaf9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -100,8 +100,10 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, * readonly and make sure do not write checkpoint with non-uptodate * meta page. */ - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + memset(page_address(page), 0, PAGE_SIZE); f2fs_stop_checkpoint(sbi, false); + } out: return page; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2056211379f9..8187ef8bab98 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -320,10 +320,10 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3c2e44f76ff3..a1f9c8a19383 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2059,6 +2059,7 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *dst; dst = (struct f2fs_summary_block *)page_address(page); + memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); @@ -3214,6 +3215,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) page = grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char 
*)page_address(page); + memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -3238,6 +3240,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) if (!page) { page = grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 96a2d57ba8a4..e352e01854b0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -375,6 +375,7 @@ static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, int i; raw_sit = (struct f2fs_sit_block *)page_address(page); + memset(raw_sit, 0, PAGE_SIZE); for (i = 0; i < end - start; i++) { rs = &raw_sit->entries[i]; se = get_seg_entry(sbi, start + i); From 3e90db63fcfcac8c406704b165597d2a33de4450 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Apr 2018 11:25:53 +0800 Subject: [PATCH 0703/1212] f2fs: remove unneeded F2FS_PROJINHERIT_FL Now F2FS_FL_USER_VISIBLE and F2FS_FL_USER_MODIFIABLE has included F2FS_PROJINHERIT_FL, so remove unneeded F2FS_PROJINHERIT_FL when using visible/modifiable flag macro. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b2db8349c97b..5b4802a67eba 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -689,7 +689,7 @@ int f2fs_getattr(struct vfsmount *mnt, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } - flags = fi->i_flags & (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); + flags = fi->i_flags & F2FS_FL_USER_VISIBLE; if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & F2FS_COMPR_FL) @@ -1632,8 +1632,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) } } - flags = flags & (F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); - flags |= oldflags & ~(F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); + flags = flags & (F2FS_FL_USER_MODIFIABLE); + flags |= oldflags & ~(F2FS_FL_USER_MODIFIABLE); fi->i_flags = flags; inode->i_ctime = current_time(inode); From 17f85d070886c69dd5bc5f32dc4fcdbd24199a7a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Apr 2018 11:27:14 +0800 Subject: [PATCH 0704/1212] f2fs: fix to show missing bits in FS_IOC_GETFLAGS This patch fixes FS_IOC_GETFLAGS to report the missing encrypt and inline_data flags, as ext4 does.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5b4802a67eba..06f500177bde 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1590,8 +1590,15 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & - (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); + unsigned int flags = fi->i_flags; + + if (file_is_encrypt(inode)) + flags |= F2FS_ENCRYPT_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + flags |= F2FS_INLINE_DATA_FL; + + flags &= F2FS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); } From 9190cadf38db9a3b321c8882b1d27219a5e6f436 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Apr 2018 20:39:03 +0800 Subject: [PATCH 0705/1212] f2fs: correct return value of f2fs_trim_fs Correct return value in two cases: - return EINVAL if end boundary is out-of-range. - return EIO if fs needs off-line check. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a1f9c8a19383..f1fe260537e0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2489,12 +2489,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) return -EINVAL; if (end <= MAIN_BLKADDR(sbi)) - goto out; + return -EINVAL; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_msg(sbi->sb, KERN_WARNING, "Found FS corruption, run fsck to fix."); - goto out; + return -EIO; } /* start/end segment number in main_area */ From ea2813111f1f31e04892c955291356322eec23b8 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Mon, 9 Apr 2018 04:28:41 +0800 Subject: [PATCH 0706/1212] f2fs: check cur_valid_map_mir & raw_sit block count when flush sit entries We should check valid_map_mir and block count to ensure the flushed raw_sit is correct. Signed-off-by: Zhikang Zhang Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1fe260537e0..7d6c1e4b1374 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3481,6 +3481,11 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) int offset, sit_offset; se = get_seg_entry(sbi, segno); +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, + SIT_VBLOCK_MAP_SIZE)) + f2fs_bug_on(sbi, 1); +#endif /* add discard candidates */ if (!(cpc->reason & CP_DISCARD)) { @@ -3496,10 +3501,14 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) cpu_to_le32(segno); seg_info_to_raw_sit(se, &sit_in_journal(journal, offset)); + check_block_count(sbi, segno, + &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); + check_block_count(sbi, segno, + 
&raw_sit->entries[sit_offset]); } __clear_bit(segno, bitmap); From 0d17eb90b56aafeea4d7053e8eba8dd0cffaee39 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Sat, 14 Apr 2018 01:02:34 +0800 Subject: [PATCH 0707/1212] f2fs: change le32 to le16 of f2fs_inode->i_extra_size In the structure of f2fs_inode, i_extra_size's type is __le16, so we should keep type consistent when using it. Fixes: 704956ecf5bc ("f2fs: support inode checksum") Signed-off-by: Zhikang Zhang Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8187ef8bab98..b83e0cc49d3d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -117,7 +117,6 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *ri = &F2FS_NODE(page)->i; - int extra_isize = le32_to_cpu(ri->i_extra_isize); if (!f2fs_sb_has_inode_chksum(sbi->sb)) return false; @@ -125,7 +124,8 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; - if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), + i_inode_checksum)) return false; return true; From 9d77ded0a71d5174ce8c4657b8b49a847122b143 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Apr 2018 17:51:28 +0800 Subject: [PATCH 0708/1212] f2fs: fix race in between GC and atomic open Thread GC thread - f2fs_ioc_start_atomic_write - get_dirty_pages - filemap_write_and_wait_range - f2fs_gc - do_garbage_collect - gc_data_segment - move_data_page - f2fs_is_atomic_file - set_page_dirty - set_inode_flag(, FI_ATOMIC_FILE) Dirty data page can still be generated by GC in race condition as above call stack. 
This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write to avoid such race. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 06f500177bde..93debcf83d29 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1676,6 +1676,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + if (f2fs_is_atomic_file(inode)) goto out; @@ -1705,6 +1707,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; From aa857e0f3b0993899e39659b2f671e8cc9870ac3 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 17 Apr 2018 17:12:27 +0800 Subject: [PATCH 0709/1212] f2fs: check if inmem_pages list is empty correctly `cur' will never be NULL, we should check inmem_pages list instead. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7d6c1e4b1374..a7f0e5932642 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -328,7 +328,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) break; } - f2fs_bug_on(sbi, !cur || cur->page != page); + f2fs_bug_on(sbi, list_empty(head) || cur->page != page); list_del(&cur->list); mutex_unlock(&fi->inmem_lock); From 258489ec52208c6cc9893f3ce2791cc9d9fbb04b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 18 Apr 2018 11:06:39 +0800 Subject: [PATCH 0710/1212] f2fs: allocate hot_data for atomic write more strictly If a file not set type as hot, has dirty pages more than threshold 64 before starting atomic write, may be lose hot flag. 
v1->v2: move set FI_ATOMIC_FILE flag behind flush dirty pages too, in case of dirty pages before starting atomic use atomic mode to write back. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 93debcf83d29..7ccb832aa929 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1685,24 +1685,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - set_inode_flag(inode, FI_ATOMIC_FILE); - set_inode_flag(inode, FI_HOT_DATA); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - if (!get_dirty_pages(inode)) - goto inc_stat; + goto skip_flush; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); + if (ret) goto out; - } +skip_flush: + set_inode_flag(inode, FI_HOT_DATA); + set_inode_flag(inode, FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -inc_stat: F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); From a6d74bb282adbae0319ede6a0de3b6983c3c3b46 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 Apr 2018 17:45:02 +0800 Subject: [PATCH 0711/1212] f2fs: fix return value in f2fs_ioc_commit_atomic_write In f2fs_ioc_commit_atomic_write, if file is volatile, return -EINVAL to indicate that commit failure. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7ccb832aa929..4334683e5491 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1725,8 +1725,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); - if (f2fs_is_volatile_file(inode)) + if (f2fs_is_volatile_file(inode)) { + ret = -EINVAL; goto err_out; + } if (f2fs_is_atomic_file(inode)) { ret = commit_inmem_pages(inode); From 937f4ef79e257735e03149815fb231c8d02e3a1f Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 13 Apr 2018 11:08:05 +0800 Subject: [PATCH 0712/1212] f2fs: stop issue discard if something wrong with f2fs v4->v5: move the data corruption check into __submit_discard_cmd in order to control submitted discard IO more accurately; in addition, increase the async thread's wait time if data corruption is detected. This patch stops the async thread and the umount process from issuing discards if something is wrong with f2fs, which is similar to fstrim.
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7f0e5932642..d4b787b00c5a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1044,6 +1044,9 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (dc->state != D_PREP) return; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return; + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); dc->error = __blkdev_issue_discard(dc->bdev, @@ -1475,6 +1478,10 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + wait_ms = dpolicy.max_interval; + continue; + } if (dcc->discard_wake) dcc->discard_wake = 0; From 23d00b02878ee939100ed8aae68b5ac170899bf2 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Sat, 21 Apr 2018 14:12:50 +0800 Subject: [PATCH 0713/1212] f2fs: remove duplicated dquot_initialize and fix error handling This patch removes the duplicated dquot_initialize() call in recover_orphan_inode(), and fixes the error handling if dquot_initialize() fails. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0bdd5bdfeaf9..6d331c21f7ce 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -593,10 +593,11 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) } err = dquot_initialize(inode); - if (err) + if (err) { + iput(inode); goto err_out; + } - dquot_initialize(inode); clear_nlink(inode); /* truncate all the data during iput */ From 7aff5c69da4c925dcd7dc01a248a14be7d83d5c6 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 23 Apr 2018 10:29:13 +0800 Subject: [PATCH 0714/1212] f2fs: do not check F2FS_INLINE_DOTS in recover Only a directory may have the F2FS_INLINE_DOTS flag, so there is no need to check the flag in the recovery flow.
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4ddc2262baf1..7305226a7476 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -204,8 +204,6 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) set_inode_flag(inode, FI_DATA_EXIST); else clear_inode_flag(inode, FI_DATA_EXIST); - if (!(ri->i_inline & F2FS_INLINE_DOTS)) - clear_inode_flag(inode, FI_INLINE_DOTS); } static void recover_inode(struct inode *inode, struct page *page) From b025f6dfc018e49f53549c846c3ad6045aab39cd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 23 Apr 2018 10:36:14 +0800 Subject: [PATCH 0715/1212] f2fs: clean up commit_inmem_pages() This patch moves error handling from commit_inmem_pages() into __commit_inmem_page() for cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 54 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d4b787b00c5a..7f6f029aa866 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -343,8 +343,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __commit_inmem_pages(struct inode *inode, - struct list_head *revoke_list) +static int __commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -357,9 +356,12 @@ static int __commit_inmem_pages(struct inode *inode, .op_flags = REQ_SYNC | REQ_PRIO, .io_type = FS_DATA_IO, }; + struct list_head revoke_list; pgoff_t last_idx = ULONG_MAX; int err = 0; + INIT_LIST_HEAD(&revoke_list); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { struct page *page = cur->page; @@ -393,35 +395,13 @@ static int __commit_inmem_pages(struct inode 
*inode, last_idx = page->index; } unlock_page(page); - list_move_tail(&cur->list, revoke_list); + list_move_tail(&cur->list, &revoke_list); } if (last_idx != ULONG_MAX) f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); - if (!err) - __revoke_inmem_pages(inode, revoke_list, false, false); - - return err; -} - -int commit_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct list_head revoke_list; - int err; - - INIT_LIST_HEAD(&revoke_list); - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - - set_inode_flag(inode, FI_ATOMIC_COMMIT); - - mutex_lock(&fi->inmem_lock); - err = __commit_inmem_pages(inode, &revoke_list); if (err) { - int ret; /* * try to revoke all committed pages, but still we could fail * due to no memory or other reason, if that happened, EAGAIN @@ -430,13 +410,31 @@ int commit_inmem_pages(struct inode *inode) * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. 
*/ - ret = __revoke_inmem_pages(inode, &revoke_list, false, true); - if (ret) - err = ret; + err = __revoke_inmem_pages(inode, &revoke_list, false, true); /* drop all uncommitted pages */ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + } else { + __revoke_inmem_pages(inode, &revoke_list, false, false); } + + return err; +} + +int commit_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int err; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + set_inode_flag(inode, FI_ATOMIC_COMMIT); + + mutex_lock(&fi->inmem_lock); + err = __commit_inmem_pages(inode); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) list_del_init(&fi->inmem_ilist); From 1a5d1966c0ca8c09e94d41f4490d7d7a53dd5cb2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 23 Apr 2018 23:02:31 -0600 Subject: [PATCH 0716/1212] f2fs: give message and set need_fsck given broken node id syzbot hit the following crash on upstream commit 83beed7b2b26f232d782127792dd0cd4362fdc41 (Fri Apr 20 17:56:32 2018 +0000) Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=d154ec99402c6f628887 C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5414336294027264 syzkaller reproducer: https://syzkaller.appspot.com/x/repro.syz?id=5471683234234368 Raw console output: https://syzkaller.appspot.com/x/log.txt?id=5436660795834368 Kernel config: https://syzkaller.appspot.com/x/.config?id=1808800213120130118 compiler: gcc (GCC) 8.0.1 20180413 (experimental) IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+d154ec99402c6f628887@syzkaller.appspotmail.com It will help syzbot understand when the bug is fixed. See footer for details. If you forward the report, please keep this part and the footer. 
F2FS-fs (loop0): Magic Mismatch, valid(0xf2f52010) - read(0x0) F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (loop0): invalid crc value ------------[ cut here ]------------ kernel BUG at fs/f2fs/node.c:1185! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 4549 Comm: syzkaller704305 Not tainted 4.17.0-rc1+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__get_node_page+0xb68/0x16e0 fs/f2fs/node.c:1185 RSP: 0018:ffff8801d960e820 EFLAGS: 00010293 RAX: ffff8801d88205c0 RBX: 0000000000000003 RCX: ffffffff82f6cc06 RDX: 0000000000000000 RSI: ffffffff82f6d5e8 RDI: 0000000000000004 RBP: ffff8801d960ec30 R08: ffff8801d88205c0 R09: ffffed003b5e46c2 R10: 0000000000000003 R11: 0000000000000003 R12: ffff8801a86e00c0 R13: 0000000000000001 R14: ffff8801a86e0530 R15: ffff8801d9745240 FS: 000000000072c880(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3d403209b8 CR3: 00000001d8f3f000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: get_node_page fs/f2fs/node.c:1237 [inline] truncate_xattr_node+0x152/0x2e0 fs/f2fs/node.c:1014 remove_inode_page+0x200/0xaf0 fs/f2fs/node.c:1039 f2fs_evict_inode+0xe86/0x1710 fs/f2fs/inode.c:547 evict+0x4a6/0x960 fs/inode.c:557 iput_final fs/inode.c:1519 [inline] iput+0x62d/0xa80 fs/inode.c:1545 f2fs_fill_super+0x5f4e/0x7bf0 fs/f2fs/super.c:2849 mount_bdev+0x30c/0x3e0 fs/super.c:1164 f2fs_mount+0x34/0x40 fs/f2fs/super.c:3020 mount_fs+0xae/0x328 fs/super.c:1267 vfs_kern_mount.part.34+0xd4/0x4d0 fs/namespace.c:1037 vfs_kern_mount fs/namespace.c:1027 [inline] do_new_mount fs/namespace.c:2518 [inline] do_mount+0x564/0x3070 fs/namespace.c:2848 ksys_mount+0x12d/0x140 fs/namespace.c:3064 __do_sys_mount fs/namespace.c:3078 [inline] 
__se_sys_mount fs/namespace.c:3075 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3075 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x443dea RSP: 002b:00007ffcc7882368 EFLAGS: 00000297 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000020000c00 RCX: 0000000000443dea RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffcc7882370 RBP: 0000000000000003 R08: 0000000020016a00 R09: 000000000000000a R10: 0000000000000000 R11: 0000000000000297 R12: 0000000000000004 R13: 0000000000402ce0 R14: 0000000000000000 R15: 0000000000000000 RIP: __get_node_page+0xb68/0x16e0 fs/f2fs/node.c:1185 RSP: ffff8801d960e820 ---[ end trace 4edbeb71f002bb76 ]--- Reported-and-tested-by: syzbot+d154ec99402c6f628887@syzkaller.appspotmail.com Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 +------------ fs/f2fs/inode.c | 13 ++++++------- fs/f2fs/node.c | 21 +++++++++++++++++++-- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 66c315a8ef78..527999edc2a9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1649,18 +1649,6 @@ static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } -/* - * Check whether the given nid is within node id range. 
- */ -static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) -{ - if (unlikely(nid < F2FS_ROOT_INO(sbi))) - return -EINVAL; - if (unlikely(nid >= NM_I(sbi)->max_nid)) - return -EINVAL; - return 0; -} - /* * Check whether the inode has blocks or not */ @@ -2854,6 +2842,7 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, struct dnode_of_data; struct node_info; +int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool available_free_memory(struct f2fs_sb_info *sbi, int type); int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b83e0cc49d3d..ff99110194ef 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -194,12 +194,8 @@ static int do_read_inode(struct inode *inode) projid_t i_projid; /* Check if ino is within scope */ - if (check_nid_range(sbi, inode->i_ino)) { - f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", - (unsigned long) inode->i_ino); - WARN_ON(1); + if (check_nid_range(sbi, inode->i_ino)) return -EINVAL; - } node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) @@ -588,8 +584,11 @@ void f2fs_evict_inode(struct inode *inode) alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); } else { - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); + /* + * If xattr nid is corrupted, we can reach out error condition, + * err & !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, check_nid_range() is enough to give a clue. + */ } out_clear: fscrypt_put_encryption_info(inode, NULL); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 16aee2a7b8a9..7c3e8190cff2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -29,6 +29,21 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; +/* + * Check whether the given nid is within node id range. 
+ */ +int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: out-of-range nid=%x, run fsck to fix.", + __func__, nid); + return -EINVAL; + } + return 0; +} + bool available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1158,7 +1173,8 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + if (check_nid_range(sbi, nid)) + return; rcu_read_lock(); apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); @@ -1182,7 +1198,8 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (!nid) return ERR_PTR(-ENOENT); - f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + if (check_nid_range(sbi, nid)) + return ERR_PTR(-EINVAL); repeat: page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) From ab758ada220fe5c9f1419bcd6c8fb249a3bd1dd4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 11:37:18 -0600 Subject: [PATCH 0717/1212] f2fs: avoid bug_on on corrupted inode syzbot has tested the proposed patch but the reproducer still triggered crash: kernel BUG at fs/f2fs/inode.c:LINE! F2FS-fs (loop1): invalid crc value F2FS-fs (loop5): Magic Mismatch, valid(0xf2f52010) - read(0x0) F2FS-fs (loop5): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (loop5): invalid crc value ------------[ cut here ]------------ kernel BUG at fs/f2fs/inode.c:238! 
invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 4886 Comm: syz-executor1 Not tainted 4.17.0-rc1+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:do_read_inode fs/f2fs/inode.c:238 [inline] RIP: 0010:f2fs_iget+0x3307/0x3ca0 fs/f2fs/inode.c:313 RSP: 0018:ffff8801c44a70e8 EFLAGS: 00010293 RAX: ffff8801ce208040 RBX: ffff8801b3621080 RCX: ffffffff82eace18 F2FS-fs (loop2): Magic Mismatch, valid(0xf2f52010) - read(0x0) RDX: 0000000000000000 RSI: ffffffff82eaf047 RDI: 0000000000000007 RBP: ffff8801c44a7410 R08: ffff8801ce208040 R09: ffffed0039ee4176 R10: ffffed0039ee4176 R11: ffff8801cf720bb7 R12: ffff8801c0efa000 R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f753aa9d700(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 ------------[ cut here ]------------ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel BUG at fs/f2fs/inode.c:238! CR2: 0000000001b03018 CR3: 00000001c8b74000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: f2fs_fill_super+0x4377/0x7bf0 fs/f2fs/super.c:2842 mount_bdev+0x30c/0x3e0 fs/super.c:1165 f2fs_mount+0x34/0x40 fs/f2fs/super.c:3020 mount_fs+0xae/0x328 fs/super.c:1268 vfs_kern_mount.part.34+0xd4/0x4d0 fs/namespace.c:1037 vfs_kern_mount fs/namespace.c:1027 [inline] do_new_mount fs/namespace.c:2517 [inline] do_mount+0x564/0x3070 fs/namespace.c:2847 ksys_mount+0x12d/0x140 fs/namespace.c:3063 __do_sys_mount fs/namespace.c:3077 [inline] __se_sys_mount fs/namespace.c:3074 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3074 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457daa RSP: 002b:00007f753aa9cba8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 0000000000457daa RDX: 
0000000020000000 RSI: 0000000020000100 RDI: 00007f753aa9cbf0 RBP: 0000000000000064 R08: 0000000020016a00 R09: 0000000020000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 R13: 0000000000000064 R14: 00000000006fcb80 R15: 0000000000000000 RIP: do_read_inode fs/f2fs/inode.c:238 [inline] RSP: ffff8801c44a70e8 RIP: f2fs_iget+0x3307/0x3ca0 fs/f2fs/inode.c:313 RSP: ffff8801c44a70e8 invalid opcode: 0000 [#2] SMP KASAN ---[ end trace 1cbcbec2156680bc ]--- Reported-and-tested-by: syzbot+41a1b341571f0952badb@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ff99110194ef..bface995617b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -185,6 +185,21 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } +static bool sanity_check_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) + && !f2fs_has_extra_attr(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode ino=%lx, run fsck to fix.", + __func__, inode->i_ino); + return false; + } + return true; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -235,7 +250,6 @@ static int do_read_inode(struct inode *inode) le16_to_cpu(ri->i_extra_isize) : 0; if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { - f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -313,6 +327,10 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = do_read_inode(inode); if (ret) goto bad_inode; + if (!sanity_check_inode(inode)) { + ret = -EINVAL; + goto 
bad_inode; + } make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; From 78f8b0f46fa23f9dc5c8b501db414e9546ba44b0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 15:44:16 -0600 Subject: [PATCH 0718/1212] f2fs: sanity check on sit entry syzbot hit the following crash on upstream commit 87ef12027b9b1dd0e0b12cf311fbcb19f9d92539 (Wed Apr 18 19:48:17 2018 +0000) Merge tag 'ceph-for-4.17-rc2' of git://github.com/ceph/ceph-client syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=83699adeb2d13579c31e C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5805208181407744 syzkaller reproducer: https://syzkaller.appspot.com/x/repro.syz?id=6005073343676416 Raw console output: https://syzkaller.appspot.com/x/log.txt?id=6555047731134464 Kernel config: https://syzkaller.appspot.com/x/.config?id=1808800213120130118 compiler: gcc (GCC) 8.0.1 20180413 (experimental) IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+83699adeb2d13579c31e@syzkaller.appspotmail.com It will help syzbot understand when the bug is fixed. See footer for details. If you forward the report, please keep this part and the footer. 
F2FS-fs (loop0): Magic Mismatch, valid(0xf2f52010) - read(0x0) F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (loop0): invalid crc value BUG: unable to handle kernel paging request at ffffed006b2a50c0 PGD 21ffee067 P4D 21ffee067 PUD 21fbeb067 PMD 0 Oops: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 4514 Comm: syzkaller989480 Not tainted 4.17.0-rc1+ #8 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:build_sit_entries fs/f2fs/segment.c:3653 [inline] RIP: 0010:build_segment_manager+0x7ef7/0xbf70 fs/f2fs/segment.c:3852 RSP: 0018:ffff8801b102e5b0 EFLAGS: 00010a06 RAX: 1ffff1006b2a50c0 RBX: 0000000000000004 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8801ac74243e RBP: ffff8801b102f410 R08: ffff8801acbd46c0 R09: fffffbfff14d9af8 R10: fffffbfff14d9af8 R11: ffff8801acbd46c0 R12: ffff8801ac742a80 R13: ffff8801d9519100 R14: dffffc0000000000 R15: ffff880359528600 FS: 0000000001e04880(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffed006b2a50c0 CR3: 00000001ac6ac000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: f2fs_fill_super+0x4095/0x7bf0 fs/f2fs/super.c:2803 mount_bdev+0x30c/0x3e0 fs/super.c:1165 f2fs_mount+0x34/0x40 fs/f2fs/super.c:3020 mount_fs+0xae/0x328 fs/super.c:1268 vfs_kern_mount.part.34+0xd4/0x4d0 fs/namespace.c:1037 vfs_kern_mount fs/namespace.c:1027 [inline] do_new_mount fs/namespace.c:2517 [inline] do_mount+0x564/0x3070 fs/namespace.c:2847 ksys_mount+0x12d/0x140 fs/namespace.c:3063 __do_sys_mount fs/namespace.c:3077 [inline] __se_sys_mount fs/namespace.c:3074 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3074 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 
0033:0x443d6a RSP: 002b:00007ffd312813c8 EFLAGS: 00000297 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000020000c00 RCX: 0000000000443d6a RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffd312813d0 RBP: 0000000000000003 R08: 0000000020016a00 R09: 000000000000000a R10: 0000000000000000 R11: 0000000000000297 R12: 0000000000000004 R13: 0000000000402c60 R14: 0000000000000000 R15: 0000000000000000 RIP: build_sit_entries fs/f2fs/segment.c:3653 [inline] RSP: ffff8801b102e5b0 RIP: build_segment_manager+0x7ef7/0xbf70 fs/f2fs/segment.c:3852 RSP: ffff8801b102e5b0 CR2: ffffed006b2a50c0 ---[ end trace a2034989e196ff17 ]--- Reported-and-tested-by: syzbot+83699adeb2d13579c31e@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7f6f029aa866..ae3cf8dce38e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3762,6 +3762,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int old_valid_blocks; start = le32_to_cpu(segno_in_journal(journal, i)); + if (start >= MAIN_SEGS(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong journal entry on segno %u", + start); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + break; + } + se = &sit_i->sentries[start]; sit = sit_in_journal(journal, i); From 26bf4e8a96aada18cf1b23a920f1f3ee50b5b739 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 21:34:05 -0600 Subject: [PATCH 0719/1212] f2fs: sanity check for total valid node blocks This patch enhances sanity check for SIT entries. 
syzbot hit the following crash on upstream commit 83beed7b2b26f232d782127792dd0cd4362fdc41 (Fri Apr 20 17:56:32 2018 +0000) Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=bf9253040425feb155ad syzkaller reproducer: https://syzkaller.appspot.com/x/repro.syz?id=5692130282438656 Raw console output: https://syzkaller.appspot.com/x/log.txt?id=5095924598571008 Kernel config: https://syzkaller.appspot.com/x/.config?id=1808800213120130118 compiler: gcc (GCC) 8.0.1 20180413 (experimental) IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+bf9253040425feb155ad@syzkaller.appspotmail.com It will help syzbot understand when the bug is fixed. See footer for details. If you forward the report, please keep this part and the footer. F2FS-fs (loop0): invalid crc value F2FS-fs (loop0): Try to recover 1th superblock, ret: 0 F2FS-fs (loop0): Mounted with checkpoint version = d F2FS-fs (loop0): Bitmap was wrongly cleared, blk:9740 ------------[ cut here ]------------ kernel BUG at fs/f2fs/segment.c:1884! 
invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 4508 Comm: syz-executor0 Not tainted 4.17.0-rc1+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:update_sit_entry+0x1215/0x1590 fs/f2fs/segment.c:1882 RSP: 0018:ffff8801af526708 EFLAGS: 00010282 RAX: ffffed0035ea4cc0 RBX: ffff8801ad454f90 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff82eeb87e RDI: ffffed0035ea4cb6 RBP: ffff8801af526760 R08: ffff8801ad4a2480 R09: ffffed003b5e4f90 R10: ffffed003b5e4f90 R11: ffff8801daf27c87 R12: ffff8801adb8d380 R13: 0000000000000001 R14: 0000000000000008 R15: 00000000ffffffff FS: 00000000014af940(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f06bc223000 CR3: 00000001adb02000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: allocate_data_block+0x66f/0x2050 fs/f2fs/segment.c:2663 do_write_page+0x105/0x1b0 fs/f2fs/segment.c:2727 write_node_page+0x129/0x350 fs/f2fs/segment.c:2770 __write_node_page+0x7da/0x1370 fs/f2fs/node.c:1398 sync_node_pages+0x18cf/0x1eb0 fs/f2fs/node.c:1652 block_operations+0x429/0xa60 fs/f2fs/checkpoint.c:1088 write_checkpoint+0x3ba/0x5380 fs/f2fs/checkpoint.c:1405 f2fs_sync_fs+0x2fb/0x6a0 fs/f2fs/super.c:1077 __sync_filesystem fs/sync.c:39 [inline] sync_filesystem+0x265/0x310 fs/sync.c:67 generic_shutdown_super+0xd7/0x520 fs/super.c:429 kill_block_super+0xa4/0x100 fs/super.c:1191 kill_f2fs_super+0x9f/0xd0 fs/f2fs/super.c:3030 deactivate_locked_super+0x97/0x100 fs/super.c:316 deactivate_super+0x188/0x1b0 fs/super.c:347 cleanup_mnt+0xbf/0x160 fs/namespace.c:1174 __cleanup_mnt+0x16/0x20 fs/namespace.c:1181 task_work_run+0x1e4/0x290 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:191 [inline] exit_to_usermode_loop+0x2bd/0x310 
arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457d97 RSP: 002b:00007ffd46f9c8e8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000457d97 RDX: 00000000014b09a3 RSI: 0000000000000002 RDI: 00007ffd46f9da50 RBP: 00007ffd46f9da50 R08: 0000000000000000 R09: 0000000000000009 R10: 0000000000000005 R11: 0000000000000246 R12: 00000000014b0940 R13: 0000000000000000 R14: 0000000000000002 R15: 000000000000658e RIP: update_sit_entry+0x1215/0x1590 fs/f2fs/segment.c:1882 RSP: ffff8801af526708 ---[ end trace f498328bb02610a2 ]--- Reported-and-tested-by: syzbot+bf9253040425feb155ad@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+7d6d31d3bc702f566ce3@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+0a725420475916460f12@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae3cf8dce38e..29a648e01415 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3712,6 +3712,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; + block_t total_node_blocks = 0; do { readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, @@ -3734,6 +3735,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; /* build discard map only one time */ if (f2fs_discard_en(sbi)) { @@ -3775,11 +3778,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; + if (IS_NODESEG(se->type)) + total_node_blocks -= 
old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; if (f2fs_discard_en(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -3798,6 +3805,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) se->valid_blocks - old_valid_blocks; } up_read(&curseg->journal_rwsem); + + if (!err && total_node_blocks != valid_node_count(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "SIT is corrupted node# %u vs %u", + total_node_blocks, valid_node_count(sbi)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + } + return err; } From cb38cc4e1d02dcad7bbaad1bd7d5e1dc2ac2b78b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 22:43:01 -0600 Subject: [PATCH 0720/1212] f2fs: enforce fsync_mode=strict for renamed directory This is to give an option for user to be able to recover B/foo in the below case. mkdir A sync() rename(A, B) creat (B/foo) fsync (B/foo) ---crash--- Suggested-by: Velayudhan Pillai Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index dd77ecbd536d..e6ddc9be1e60 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -970,8 +970,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); f2fs_i_links_write(old_dir, false); } - if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (S_ISDIR(old_inode->i_mode)) + add_ino_entry(sbi, old_inode->i_ino, TRANS_DIR_INO); + } f2fs_unlock_op(sbi); From 8bb9a8da75d1678f5c4fc9ec5ea8702960102221 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 24 Apr 2018 11:40:19 +0800 Subject: [PATCH 0721/1212] f2fs: fix missing clear FI_NO_PREALLOC in some error case This patch fixes the missing clearing of FI_NO_PREALLOC in some error cases
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4334683e5491..e1808ed8fc3e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2732,6 +2732,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_count(from)) || f2fs_has_inline_data(inode) || f2fs_force_buffered_io(inode, WRITE)) { + clear_inode_flag(inode, + FI_NO_PREALLOC); inode_unlock(inode); return -EAGAIN; } From bb015824532c007d9bfbfea91f731d4e02c36320 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 24 Apr 2018 11:40:30 +0800 Subject: [PATCH 0722/1212] f2fs: move mnt_want_write_file after range check This patch moves mnt_want_write_file after the range check, since it is needless to check arguments with it held. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e1808ed8fc3e..86aa14819637 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2060,15 +2060,15 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + return -EINVAL; + } + ret = mnt_want_write_file(filp); if (ret) return ret; - end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { - ret = -EINVAL; - goto out; - } do_more: if (!range.sync) { if (!mutex_trylock(&sbi->gc_mutex)) { From f46eddc4da48ec1b8fbee2a1f53356bfa67eec3d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Apr 2018 10:55:28 +0800 Subject: [PATCH 0723/1212] f2fs: rename dio_rwsem to i_gc_rwsem RW semaphore dio_rwsem in struct f2fs_inode_info was introduced to avoid races between dio and data gc, but now it is more widely used to avoid races between foreground operations and data gc.
So rename it to i_gc_rwsem to improve its readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/file.c | 28 ++++++++++++++-------------- fs/f2fs/gc.c | 14 +++++++------- fs/f2fs/super.c | 4 ++-- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b675d5dd5c91..4436aba07617 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2403,17 +2403,17 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) { + if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) { if (iocb->ki_flags & IOCB_NOWAIT) { iocb->ki_hint = hint; err = -EAGAIN; goto out; } - down_read(&F2FS_I(inode)->dio_rwsem[rw]); + down_read(&F2FS_I(inode)->i_gc_rwsem[rw]); } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); - up_read(&F2FS_I(inode)->dio_rwsem[rw]); + up_read(&F2FS_I(inode)->i_gc_rwsem[rw]); if (rw == WRITE) { if (whint_mode == WHINT_MODE_OFF) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 527999edc2a9..0408c9eafa3a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -717,7 +717,9 @@ struct f2fs_inode_info { struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ - struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + + /* avoid racing between foreground op and gc */ + struct rw_semaphore i_gc_rwsem[2]; struct rw_semaphore i_mmap_sem; struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 86aa14819637..79bf6ac9b568 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1193,7 +1193,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_end = 
(offset + len) >> PAGE_SHIFT; /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ @@ -1219,7 +1219,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out_unlock: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1395,7 +1395,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); @@ -1436,7 +1436,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1676,7 +1676,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (f2fs_is_atomic_file(inode)) goto out; @@ -1703,7 +1703,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1723,7 +1723,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ 
-1745,7 +1745,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2332,12 +2332,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->dio_rwsem[WRITE]); + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { inode_unlock(dst); goto out; } @@ -2399,11 +2399,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_unlock_op(sbi); out_unlock: if (src != dst) { - up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); inode_unlock(dst); } out: - up_write(&F2FS_I(src)->dio_rwsem[WRITE]); + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } @@ -2625,9 +2625,9 @@ int f2fs_precache_extents(struct inode *inode) while (map.m_lblk < end) { map.m_len = end - map.m_lblk; - down_write(&fi->dio_rwsem[WRITE]); + down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); - up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d28d31cbd7d2..96b151546279 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -858,7 +858,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } if (!down_write_trylock( - &F2FS_I(inode)->dio_rwsem[WRITE])) { + &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); continue; } @@ -867,7 +867,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); - 
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -885,11 +885,11 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, bool locked = false; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->dio_rwsem[READ])) + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) continue; if (!down_write_trylock( - &fi->dio_rwsem[WRITE])) { - up_write(&fi->dio_rwsem[READ]); + &fi->i_gc_rwsem[WRITE])) { + up_write(&fi->i_gc_rwsem[READ]); continue; } locked = true; @@ -907,8 +907,8 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, segno, off); if (locked) { - up_write(&fi->dio_rwsem[WRITE]); - up_write(&fi->dio_rwsem[READ]); + up_write(&fi->i_gc_rwsem[WRITE]); + up_write(&fi->i_gc_rwsem[READ]); } stat_inc_data_blk_count(sbi, 1, gc_type); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b6ce10f8128a..e83691880914 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -837,8 +837,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - init_rwsem(&fi->dio_rwsem[READ]); - init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_gc_rwsem[READ]); + init_rwsem(&fi->i_gc_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); From 03279ce90b4666931c32cebf089c49a223db0c09 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Apr 2018 19:38:17 +0800 Subject: [PATCH 0724/1212] f2fs: fix potential overflow In build_sit_entries(), if valid_blocks in SIT block is smaller than valid_blocks in journal, for below calculation: sbi->discard_blks += old_valid_blocks - se->valid_blocks; There will be two potential overflows: - old_valid_blocks - se->valid_blocks will overflow, and be a very large number. - sbi->discard_blks += result will overflow again, and accidentally come out as a correct result.
Anyway, it should be fixed. Fixes: d600af236da5 ("f2fs: avoid unneeded loop in build_sit_entries") Fixes: 1f43e2ad7bff ("f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 29a648e01415..b6a420d65f4e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3795,14 +3795,17 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } else { memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; } } - if (sbi->segs_per_sec > 1) + if (sbi->segs_per_sec > 1) { get_sec_entry(sbi, start)->valid_blocks += - se->valid_blocks - old_valid_blocks; + se->valid_blocks; + get_sec_entry(sbi, start)->valid_blocks -= + old_valid_blocks; + } } up_read(&curseg->journal_rwsem); From 2cf64590361ec367f3d2b91ab29777eb087222bb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Apr 2018 17:38:29 +0800 Subject: [PATCH 0725/1212] f2fs: introduce release_discard_addr() for cleanup Introduce release_discard_addr() to include common codes for cleanup. 
Signed-off-by: Chao Yu [Fengguang Wu: declare static function, reported by kbuild test robot] Signed-off-by: Fengguang Wu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b6a420d65f4e..aa5da6ea4ff8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1661,16 +1661,20 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, return false; } +static void release_discard_addr(struct discard_entry *entry) +{ + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); +} + void release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ - list_for_each_entry_safe(entry, this, head, list) { - list_del(&entry->list); - kmem_cache_free(discard_entry_slab, entry); - } + list_for_each_entry_safe(entry, this, head, list) + release_discard_addr(entry); } /* @@ -1770,9 +1774,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cur_pos < sbi->blocks_per_seg) goto find_next; - list_del(&entry->list); + release_discard_addr(entry); dcc->nr_discards -= total_len; - kmem_cache_free(discard_entry_slab, entry); } wake_up_discard_thread(sbi, false); From 9bb86b63dc0f16877a3014611bce29921c1b2ffa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Apr 2018 17:05:50 +0800 Subject: [PATCH 0726/1212] f2fs: treat volatile file's data as hot one Volatile file's data will be updated often, so it'd be better to place its data into hot data segment. In addition, for atomic file, we change to check FI_ATOMIC_FILE instead of FI_HOT_DATA to make code readability better.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 -- fs/f2fs/segment.c | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 79bf6ac9b568..ae21400d3ad5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1695,7 +1695,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; skip_flush: - set_inode_flag(inode, FI_HOT_DATA); set_inode_flag(inode, FI_ATOMIC_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -1738,7 +1737,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aa5da6ea4ff8..917d7acb12cf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -309,7 +309,6 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } @@ -2693,7 +2692,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || - is_inode_flag_set(inode, FI_HOT_DATA)) + is_inode_flag_set(inode, FI_HOT_DATA) || + is_inode_flag_set(inode, FI_ATOMIC_FILE) || + is_inode_flag_set(inode, FI_VOLATILE_FILE)) return CURSEG_HOT_DATA; /* rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; From 2bba5b8eb867e9f8ab9b00ebfae3a2a833b4c9c0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Apr 2018 19:03:22 -0700 Subject: [PATCH 0727/1212] f2fs: enhance sanity_check_raw_super() to avoid potential overflows In order to avoid the below overflow issue, we should have checked the boundaries in superblock before reaching out to allocation. 
As Linus suggested, the right place should be sanity_check_raw_super(). Dr Silvio Cesare of InfoSect reported: There are integer overflows with using the cp_payload superblock field in the f2fs filesystem potentially leading to memory corruption. include/linux/f2fs_fs.h struct f2fs_super_block { ... __le32 cp_payload; fs/f2fs/f2fs.h typedef u32 block_t; /* * should not change u32, since it is the on-disk block * address format, __le32. */ ... static inline block_t __cp_payload(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); } fs/f2fs/checkpoint.c block_t start_blk, orphan_blocks, i, j; ... start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); +++ integer overflows ... unsigned int cp_blks = 1 + __cp_payload(sbi); ... sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); +++ integer overflow leading to incorrect heap allocation. int cp_payload_blks = __cp_payload(sbi); ... ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); +++ sign bug and integer overflow ... for (i = 1; i < 1 + cp_payload_blks; i++) +++ integer overflow ... 
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - NR_CURSEG_TYPE - __cp_payload(sbi)) * F2FS_ORPHANS_PER_BLOCK; +++ integer overflow Reported-by: Greg KH Reported-by: Silvio Cesare Suggested-by: Linus Torvalds Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 75 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e83691880914..ee74aa4a5f84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2137,6 +2137,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { + block_t segment_count, segs_per_sec, secs_per_zone; + block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); struct super_block *sb = sbi->sb; @@ -2193,6 +2195,72 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + segment_count = le32_to_cpu(raw_super->segment_count); + segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + total_sections = le32_to_cpu(raw_super->section_count); + + /* blocks_per_seg should be 512, given the above check */ + blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment_count > F2FS_MAX_SEGMENT || + segment_count < F2FS_MIN_SEGMENTS) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + segment_count); + return 1; + } + + if (total_sections > segment_count || + total_sections < F2FS_MIN_SEGMENTS || + segs_per_sec > segment_count || !segs_per_sec) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment/section count (%u, %u x %u)", + segment_count, total_sections, segs_per_sec); + return 1; + } + + if ((segment_count / segs_per_sec) < total_sections) { + f2fs_msg(sb, KERN_INFO, + "Small segment_count (%u < %u * %u)", + segment_count, segs_per_sec, 
total_sections); + return 1; + } + + if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { + f2fs_msg(sb, KERN_INFO, + "Wrong segment_count / block_count (%u > %u)", + segment_count, le32_to_cpu(raw_super->block_count)); + return 1; + } + + if (secs_per_zone > total_sections) { + f2fs_msg(sb, KERN_INFO, + "Wrong secs_per_zone (%u > %u)", + secs_per_zone, total_sections); + return 1; + } + if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION || + raw_super->hot_ext_count > F2FS_MAX_EXTENSION || + (le32_to_cpu(raw_super->extension_count) + + raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) { + f2fs_msg(sb, KERN_INFO, + "Corrupted extension count (%u + %u > %u)", + le32_to_cpu(raw_super->extension_count), + raw_super->hot_ext_count, + F2FS_MAX_EXTENSION); + return 1; + } + + if (le32_to_cpu(raw_super->cp_payload) > + (blocks_per_seg - F2FS_CP_PACKS)) { + f2fs_msg(sb, KERN_INFO, + "Insane cp_payload (%u > %u)", + le32_to_cpu(raw_super->cp_payload), + blocks_per_seg - F2FS_CP_PACKS); + return 1; + } + /* check reserved ino info */ if (le32_to_cpu(raw_super->node_ino) != 1 || le32_to_cpu(raw_super->meta_ino) != 2 || @@ -2205,13 +2273,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { - f2fs_msg(sb, KERN_INFO, - "Invalid segment count (%u)", - le32_to_cpu(raw_super->segment_count)); - return 1; - } - /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; From 0037c639e63d9823f8d6cd00599e4e554f2c06b0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 30 Apr 2018 16:27:44 +0100 Subject: [PATCH 0728/1212] f2fs: fix spelling mistake: "extenstion" -> "extension" Trivial fix to spelling mistake in extension list text Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/sysfs.c 
b/fs/f2fs/sysfs.c index 2c53de9251be..6d8d8f41e517 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -147,13 +147,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, int len = 0, i; len += snprintf(buf + len, PAGE_SIZE - len, - "cold file extenstion:\n"); + "cold file extension:\n"); for (i = 0; i < cold_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); len += snprintf(buf + len, PAGE_SIZE - len, - "hot file extenstion:\n"); + "hot file extension:\n"); for (i = cold_count; i < cold_count + hot_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); From 2494cc7c0bcd945ec970568b1de44f3b80aeb6d9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 May 2018 18:04:22 -0700 Subject: [PATCH 0729/1212] f2fs: don't drop any page on f2fs_cp_error() case We still provide readdir() after shutdown, so we should keep pages to avoid additional IOs. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7c3e8190cff2..a6c0e1023d13 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1380,11 +1380,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); - if (unlikely(f2fs_cp_error(sbi))) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - unlock_page(page); - return 0; - } + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; From 331ae0c25b4412df8e4c75d64e33791f16d1a264 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Apr 2018 17:05:51 +0800 Subject: [PATCH 0730/1212] Revert "f2fs: add ovp valid_blocks check for bg gc victim to fg_gc" For extreme case: 10 section, op = 10%, no_fggc_threshold = 90% All section usage: 85% 85% 85% 85% 90% 90% 95% 95% 95% 95% During foreground GC, if we skip select dirty section whose usage is larger than no_fggc_threshold, we can only recycle 80% invalid space from
four 85% usage sections and two 90% usage sections, result in encountering out-of-space issue. This reverts commit e93b9865251a0503d83fd570e7d5a7c8bc351715 to fix this issue, besides, we keep the logic that we scan all dirty section when searching a victim, so that GC can select victim with least valid blocks. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/gc.c | 16 ---------------- fs/f2fs/segment.h | 9 --------- 3 files changed, 28 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0408c9eafa3a..dfbf59a0525d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1260,9 +1260,6 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ - /* threshold for converting bg victims for fg */ - u64 fggc_threshold; - /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 96b151546279..ffcb744ffcfe 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -234,10 +234,6 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; - - if (no_fggc_candidate(sbi, secno)) - continue; - clear_bit(secno, dirty_i->victim_secmap); return GET_SEG_FROM_SEC(sbi, secno); } @@ -377,9 +373,6 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; - if (gc_type == FG_GC && p.alloc_mode == LFS && - no_fggc_candidate(sbi, secno)) - goto next; cost = get_gc_cost(sbi, segno, &p); @@ -1105,17 +1098,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count; - DIRTY_I(sbi)->v_ops = &default_v_ops; - /* threshold of # of valid blocks in a section for victims of FG_GC */ - main_count = SM_I(sbi)->main_segments << 
sbi->log_blocks_per_seg; - resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; - ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * - BLKS_PER_SEC(sbi), (main_count - resv_count)); sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e352e01854b0..21c1cc89ee6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -771,15 +771,6 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } -static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, - unsigned int secno) -{ - if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > - sbi->fggc_threshold) - return true; - return false; -} - static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) From cdcf2b3e2559797ad166d6fbf8206dc13ff25c4e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 May 2018 20:28:52 +0800 Subject: [PATCH 0731/1212] f2fs: fix to initialize i_current_depth according to inode type i_current_depth is used only for directory inode, but its space is shared with i_gc_failures field used for regular inode, in order to avoid affecting i_gc_failures' value, this patch fixes to initialize the union's fields according to inode type. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 12 +++++++++--- fs/f2fs/namei.c | 3 +++ fs/f2fs/super.c | 1 - 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bface995617b..e4d4b51fac31 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -232,8 +232,10 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); - - fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + if (S_ISDIR(inode->i_mode)) + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + else if (S_ISREG(inode->i_mode)) + fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; @@ -422,7 +424,11 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); + if (S_ISDIR(inode->i_mode)) + ri->i_current_depth = + cpu_to_le32(F2FS_I(inode)->i_current_depth); + else if (S_ISREG(inode->i_mode)) + ri->i_gc_failures = cpu_to_le16(F2FS_I(inode)->i_gc_failures); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e6ddc9be1e60..b32433d8667b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -54,6 +54,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_crtime = current_time(inode); inode->i_generation = sbi->s_next_generation++; + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_current_depth = 1; + err = insert_inode_locked(inode); if (err) { err 
= -EINVAL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ee74aa4a5f84..cc55475832e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -830,7 +830,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); - fi->i_current_depth = 1; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); From 9bb4d22cf5de448a6d5ebad67f7b8a27c9eacd0a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 May 2018 14:06:03 +0800 Subject: [PATCH 0732/1212] f2fs: fix to let checkpoint guarantee atomic page persistence 1. thread A: commit_inmem_pages submit data into block layer, but haven't waited it writeback. 2. thread A: commit_inmem_pages update related node. 3. thread B: do checkpoint, flush all nodes to disk. 4. SPOR Then, atomic file becomes corrupted since nodes is flushed before data. This patch fixes to treat atomic page as checkpoint guaranteed one, then in checkpoint, we can make sure all atomic page can be writebacked with metadata of atomic file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4436aba07617..67e3b59da064 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,6 +47,8 @@ static bool __is_cp_guaranteed(struct page *page) if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode) || + (S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_ATOMIC_FILE)) || is_cold_data(page)) return true; return false; From a5d0ccbc189a02a0931d7a3ee092d64f89d69f0f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 May 2018 18:59:55 +0800 Subject: [PATCH 0733/1212] f2fs: fix to initialize min_mtime with ULLONG_MAX Since sit_i.min_mtime's type is unsigned long long, so we should initialize it with max value of the type ULLONG_MAX instead of LLONG_MAX. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 917d7acb12cf..719022d62d8f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3919,7 +3919,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) down_write(&sit_i->sentry_lock); - sit_i->min_mtime = LLONG_MAX; + sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { unsigned int i; From 461247b21fde524b9022dcadb2a8e751ab520a55 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 May 2018 22:25:08 +0800 Subject: [PATCH 0734/1212] f2fs: clean up with is_valid_blkaddr() - rename is_valid_blkaddr() to is_valid_meta_blkaddr() for readability. - introduce is_valid_blkaddr() for cleanup. No logic change in this patch. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 18 +++++------------- fs/f2fs/f2fs.h | 9 ++++++++- fs/f2fs/file.c | 2 +- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 5 ++--- fs/f2fs/recovery.c | 6 +++--- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 2 +- 9 files changed, 25 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6d331c21f7ce..4e50459b3ad3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -119,7 +119,7 @@ struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, false); } -bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) +bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { case META_NAT: @@ -176,7 +176,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if (!is_valid_blkaddr(sbi, blkno, type)) + if (!is_valid_meta_blkaddr(sbi, blkno, type)) goto out; switch (type) { diff --git a/fs/f2fs/data.c 
b/fs/f2fs/data.c index 67e3b59da064..3000fa45b34d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -484,7 +484,7 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) spin_unlock(&io->io_lock); } - if (fio->old_blkaddr != NEW_ADDR) + if (is_valid_blkaddr(fio->old_blkaddr)) verify_block_addr(fio, fio->old_blkaddr); verify_block_addr(fio, fio->new_blkaddr); @@ -1044,7 +1044,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (!is_valid_blkaddr(blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1678,15 +1678,6 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) return should_update_inplace(inode, fio); } -static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) -{ - if (fio->old_blkaddr == NEW_ADDR) - return false; - if (fio->old_blkaddr == NULL_ADDR) - return false; - return true; -} - int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; @@ -1701,7 +1692,7 @@ int do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (valid_ipu_blkaddr(fio)) { + if (is_valid_blkaddr(fio->old_blkaddr)) { ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; @@ -1728,7 +1719,8 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ - if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + if (ipu_force || (is_valid_blkaddr(fio->old_blkaddr) && + need_inplace_update(fio))) { err = encrypt_one_page(fio); if (err) goto out_writepage; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dfbf59a0525d..4d4a344f2a55 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2728,6 +2728,13 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } +static inline bool is_valid_blkaddr(block_t blkaddr) +{ + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return false; + return true; +} + /* * file.c */ @@ -2946,7 +2953,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ae21400d3ad5..0fb9f15f2068 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -354,7 +354,7 @@ static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || - (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) + is_valid_blkaddr(blkaddr)) return true; break; case SEEK_HOLE: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e4d4b51fac31..a814dd221eed 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -72,7 +72,7 @@ static bool __written_first_block(struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (addr != NEW_ADDR && addr != NULL_ADDR) + if 
(is_valid_blkaddr(addr)) return true; return false; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a6c0e1023d13..50f6ee79f3f7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -379,8 +379,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, new_blkaddr == NULL_ADDR); f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR && - nat_get_blkaddr(e) != NULL_ADDR && + f2fs_bug_on(sbi, is_valid_blkaddr(nat_get_blkaddr(e)) && new_blkaddr == NEW_ADDR); /* increment version no as node is removed */ @@ -391,7 +390,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* change address */ nat_set_blkaddr(e, new_blkaddr); - if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7305226a7476..3c3551811134 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -252,7 +252,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) + if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) return 0; page = get_tmp_page(sbi, blkaddr); @@ -506,7 +506,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* dest is valid block, try to recover from src to dest */ - if (is_valid_blkaddr(sbi, dest, META_POR)) { + if (is_valid_meta_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = reserve_new_block(&dn); @@ -567,7 +567,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) + if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) break; ra_meta_pages_cond(sbi, blkaddr); diff --git a/fs/f2fs/segment.c 
b/fs/f2fs/segment.c index 719022d62d8f..4412c506c6ad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1975,7 +1975,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) struct seg_entry *se; bool is_cp = false; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(blkaddr)) return true; down_read(&sit_i->sentry_lock); @@ -3040,7 +3040,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 21c1cc89ee6e..3367ce263fb9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -85,7 +85,7 @@ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ + ((!is_valid_blkaddr(blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ From bf9510b162c4d0f19d4a7f834efe065b2e6b0659 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:33 -0800 Subject: [PATCH 0735/1212] mm: implement find_get_pages_range_tag() Patch series "Ranged pagevec tagged lookup", v3. In this series I provide a ranged variant of pagevec_lookup_tag() and use it in places where it makes sense. This series removes some common code and it also has a potential for speeding up some operations similarly as for pagevec_lookup_range() (but for now I can think of only artificial cases where this happens). This patch (of 16): Implement a variant of find_get_pages_tag() that stops iterating at given index. Lots of users of this function (through pagevec_lookup()) actually want a range lookup and all of them are currently open-coding this. Also create corresponding pagevec_lookup_range_tag() function. 
Link: http://lkml.kernel.org/r/20171009151359.31984-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Cc: Chao Yu Cc: David Howells Cc: David Sterba Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Ryusuke Konishi Cc: Steve French Cc: "Theodore Ts'o" Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 11 +++++++++-- mm/filemap.c | 33 ++++++++++++++++++++++++--------- mm/swap.c | 9 +++++---- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fbfadba81c5a..81ddfdc5d1d8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -359,8 +359,16 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages_tag(struct address_space *mapping, + pgoff_t *index, int tag, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, + nr_pages, pages); +} struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index b45d391b4540..b59927938834 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -29,9 +29,16 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, 
unsigned nr_pages); -unsigned pagevec_lookup_tag(struct pagevec *pvec, +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages); +static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages); + unsigned nr_pages) +{ + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, + nr_pages); +} static inline void pagevec_init(struct pagevec *pvec, int cold) { diff --git a/mm/filemap.c b/mm/filemap.c index 1bb007624b53..4d16907c0684 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1415,9 +1415,10 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1425,8 +1426,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. 
*/ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1440,6 +1442,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1478,18 +1483,28 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /* * CD/DVDs are error prone. 
When a medium error occurs, the driver may fail diff --git a/mm/swap.c b/mm/swap.c index 39395fb549c0..6eefbfabafc0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1130,14 +1130,15 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, } EXPORT_SYMBOL(pagevec_lookup); -unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *index, int tag, unsigned nr_pages) +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages) { - pvec->nr = find_get_pages_tag(mapping, index, tag, + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, nr_pages, pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup_tag); +EXPORT_SYMBOL(pagevec_lookup_range_tag); /* * Perform any setup for the swap system From e25fadabb5c779787b33198d97890e8c9b3c1c7a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:37 -0800 Subject: [PATCH 0736/1212] btrfs: use pagevec_lookup_range_tag() We want only pages from given range in btree_write_cache_pages() and extent_write_cache_pages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-3-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: David Sterba Reviewed-by: Daniel Jordan Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 257bbdcb5df6..bc6b8635917f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3932,8 +3932,8 @@ int btree_write_cache_pages(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE))) { unsigned i; scanned = 1; @@ -3943,11 +3943,6 @@ int btree_write_cache_pages(struct address_space *mapping, if (!PagePrivate(page)) continue; - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -4076,8 +4071,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE))) { unsigned i; scanned = 1; @@ -4101,12 +4096,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) flush_fn(data); From 1c7be24f65cdd4d053ef8c2b4ff83a150167fb80 Mon Sep 
17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:41 -0800 Subject: [PATCH 0737/1212] ceph: use pagevec_lookup_range_tag() We want only pages from given range in ceph_writepages_start(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-4-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Reviewed-by: "Yan, Zheng" Cc: Ilya Dryomov Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/addr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a168fb..c720b5032c68 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -797,10 +797,10 @@ static int ceph_writepages_start(struct address_space *mapping, min((pgoff_t)PAGEVEC_SIZE, max_pages - (pgoff_t)locked_pages) - 1) + 1; - pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY, want); - dout("pagevec_lookup_tag got %d\n", pvec_pages); + dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { From 18a4848ffded01d4d6f9102ce0af7fa2dd40bc7c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:44 -0800 Subject: [PATCH 0738/1212] ext4: use pagevec_lookup_range_tag() We want only pages from given range in ext4_writepages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-5-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index df30d04f6760..3eed917db1e7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2367,24 +2367,14 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) goto out; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) - goto out; - /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to From a05d8a6a2bdec7ed7200390d33dd45656df2d2eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:48 -0800 Subject: [PATCH 0739/1212] f2fs: use pagevec_lookup_range_tag() We want only pages from given range in f2fs_write_cache_pages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-6-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Chao Yu Reviewed-by: Daniel Jordan Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3000fa45b34d..a7dc2cbeb3d1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1969,8 +1969,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; @@ -1978,11 +1978,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping, struct page *page = pvec.pages[i]; bool submitted = false; - if (page->index > end) { - done = 1; - break; - } - done_index = page->index; retry_write: lock_page(page); From 6cf6fb8645ffa50459fc2a1599e9ccbcce2eb87a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:51 -0800 Subject: [PATCH 0740/1212] f2fs: simplify page iteration loops In several places we want to iterate over all tagged pages in a mapping. However the code was apparently copied from places that iterate only over a limited range and thus it checks for index <= end, optimizes the case where we are coming close to range end which is all pointless when end == ULONG_MAX. So just remove this dead code. 
[akpm@linux-foundation.org: fix warnings] Link: http://lkml.kernel.org/r/20171009151359.31984-7-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Reviewed-by: Chao Yu Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/checkpoint.c | 13 ++++------ fs/f2fs/node.c | 59 +++++++++++++++----------------------------- 2 files changed, 25 insertions(+), 47 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4e50459b3ad3..46799d35c632 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -312,9 +312,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; + pgoff_t index = 0, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; + int nr_pages; struct writeback_control wbc = { .for_reclaim = 0, }; @@ -324,13 +325,9 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, blk_start_plug(&plug); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (unlikely(nr_pages == 0)) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 50f6ee79f3f7..8ddd435ea50b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1299,21 +1299,17 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; struct page *last_page = NULL; + int nr_pages; pagevec_init(&pvec, 0); index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, 
NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1482,13 +1478,14 @@ static int f2fs_write_node_page(struct page *page, int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { - pgoff_t index, end; + pgoff_t index; pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nr_pages; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1498,15 +1495,10 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, retry: pagevec_init(&pvec, 0); index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1605,25 +1597,21 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; int step = 0; int nwritten = 0; int ret = 0; + int nr_pages; pagevec_init(&pvec, 0); next_step: index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), 
&index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1709,27 +1697,20 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = ULONG_MAX; + pgoff_t index = 0; struct pagevec pvec; int ret2 = 0, ret = 0; + int nr_pages; pagevec_init(&pvec, 0); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (unlikely(page->index > end)) - continue; - if (ino && ino_of_node(page) == ino) { f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) From 564108e83a74d8dff72fb2e8b37ebb302fc2b8ad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:55 -0800 Subject: [PATCH 0741/1212] f2fs: use find_get_pages_tag() for looking up single page __get_first_dirty_index() wants to lookup only the first dirty page after given index. There's no point in using pagevec_lookup_tag() for that. Just use find_get_pages_tag() directly. 
Link: http://lkml.kernel.org/r/20171009151359.31984-8-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Chao Yu Reviewed-by: Daniel Jordan Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/file.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0fb9f15f2068..2a87f4531e0e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -333,18 +333,19 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { - struct pagevec pvec; + struct page *page; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ - pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, - PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; - pagevec_release(&pvec); + nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!nr_pages) + return ULONG_MAX; + pgofs = page->index; + put_page(page); return pgofs; } From 160355d69f4610cccc570fec7d72a8e87da4428c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:58 -0800 Subject: [PATCH 0742/1212] gfs2: use pagevec_lookup_range_tag() We want only pages from given range in gfs2_write_cache_jdata(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-9-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/gfs2/aops.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1caee0534587..2505627f024e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -249,22 +249,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - ret = 1; - break; - } - *done_index = page->index; lock_page(page); @@ -382,8 +366,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; From 94f1b99298bd5d82b855664f721a6f543617df4e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:02 -0800 Subject: [PATCH 0743/1212] nilfs2: use pagevec_lookup_range_tag() We want only pages from given range in nilfs_lookup_dirty_data_buffers(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-10-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Acked-by: Ryusuke Konishi Cc: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3b65adaae7e4..5bcd2f32449b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -705,18 +705,14 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, pagevec_init(&pvec, 0); repeat: if (unlikely(index > last) || - !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - min_t(pgoff_t, last - index, - PAGEVEC_SIZE - 1) + 1)) + !pagevec_lookup_range_tag(&pvec, mapping, &index, last, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { struct buffer_head *bh, *head; struct page *page = pvec.pages[i]; - if (unlikely(page->index > last)) - break; - lock_page(page); if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); From 26778b87a0067fd32b061d1977000c17c4685ffd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:05 -0800 Subject: [PATCH 0744/1212] mm: use pagevec_lookup_range_tag() in __filemap_fdatawait_range() Use pagevec_lookup_range_tag() in __filemap_fdatawait_range() as it is interested only in pages from given range. Remove unnecessary code resulting from this. 
Link: http://lkml.kernel.org/r/20171009151359.31984-11-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 4d16907c0684..1544865fa64a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -344,19 +344,17 @@ static int __filemap_fdatawait_range(struct address_space *mapping, goto out; pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + while (index <= end) { unsigned i; + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - wait_on_page_writeback(page); if (TestClearPageError(page)) ret = -EIO; From 8914877e374a1bef6834d1adfab32a4564943f12 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:09 -0800 Subject: [PATCH 0745/1212] mm: use pagevec_lookup_range_tag() in write_cache_pages() Use pagevec_lookup_range_tag() in write_cache_pages() as it is interested only in pages from given range. Remove unnecessary code resulting from this. 
Link: http://lkml.kernel.org/r/20171009151359.31984-12-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fd51ebfc423f..4bdd7ef43f6e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2187,30 +2187,14 @@ int write_cache_pages(struct address_space *mapping, while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } - done_index = page->index; lock_page(page); From f3aa4a25b8b0f92f537b8f55538a4e687ead1336 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:12 -0800 Subject: [PATCH 0746/1212] mm: add variant of pagevec_lookup_range_tag() taking number of pages Currently pagevec_lookup_range_tag() takes number of pages to look up but most users don't need this. Create a new function pagevec_lookup_range_nr_tag() that takes maximum number of pages to lookup for Ceph which wants this functionality so that we can drop nr_pages argument from pagevec_lookup_range_tag(). 
Link: http://lkml.kernel.org/r/20171009151359.31984-13-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 +++ mm/swap.c | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index b59927938834..cfed0c5ec659 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -32,6 +32,9 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned nr_pages); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) diff --git a/mm/swap.c b/mm/swap.c index 6eefbfabafc0..bef40c04f864 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,6 +1140,15 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range_tag); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages) +{ + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ From feb94dc82928286a323576eaaadfec057d474112 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:16 -0800 Subject: [PATCH 0747/1212] ceph: use pagevec_lookup_range_nr_tag() Use new function for looking up pages since nr_pages argument from pagevec_lookup_range_tag() is going away. 
Link: http://lkml.kernel.org/r/20171009151359.31984-14-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: "Yan, Zheng" Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/addr.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c720b5032c68..e6bb73963914 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -783,8 +783,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct page **pages = NULL; mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; - int want; - u64 offset, len; + u64 offset = 0, len = 0; long writeback_stat; next = 0; @@ -793,13 +792,9 @@ static int ceph_writepages_start(struct address_space *mapping, get_more_pages: first = -1; - want = min(end - index, - min((pgoff_t)PAGEVEC_SIZE, - max_pages - (pgoff_t)locked_pages) - 1) - + 1; pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, PAGECACHE_TAG_DIRTY, - want); + max_pages - locked_pages); dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; From 91e7d9d2ddbfda40393c8400e3d0b4852ea3c6d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:19 -0800 Subject: [PATCH 0748/1212] mm: remove nr_pages argument from pagevec_lookup_{,range}_tag() All users of pagevec_lookup_tag() and pagevec_lookup_range_tag() now pass PAGEVEC_SIZE as the desired number of pages. Just drop the argument.
Link: http://lkml.kernel.org/r/20171009151359.31984-15-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 6 +++--- fs/ceph/addr.c | 3 +-- fs/ext4/inode.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 8 ++++---- fs/gfs2/aops.c | 2 +- fs/nilfs2/btree.c | 4 ++-- fs/nilfs2/page.c | 7 +++---- fs/nilfs2/segment.c | 6 +++--- include/linux/pagevec.h | 8 +++----- mm/filemap.c | 2 +- mm/page-writeback.c | 2 +- mm/swap.c | 4 ++-- 14 files changed, 27 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bc6b8635917f..322a4046a23a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3933,7 +3933,7 @@ int btree_write_cache_pages(struct address_space *mapping, tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + tag))) { unsigned i; scanned = 1; @@ -4071,8 +4071,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, + &index, end, tag))) { unsigned i; scanned = 1; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e6bb73963914..c30366bb034e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -793,8 +793,7 @@ static int ceph_writepages_start(struct address_space *mapping, get_more_pages: first = -1; pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY, - max_pages - locked_pages); + end, PAGECACHE_TAG_DIRTY); dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; diff --git a/fs/ext4/inode.c 
b/fs/ext4/inode.c index 3eed917db1e7..0550beb2b255 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2368,7 +2368,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) goto out; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 46799d35c632..0159a84ba02d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -326,7 +326,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, blk_start_plug(&plug); while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a7dc2cbeb3d1..02be804b1226 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1970,7 +1970,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ddd435ea50b..0d6bb27370ff 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1308,7 +1308,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1497,7 +1497,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1610,7 +1610,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, 
NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1705,7 +1705,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) pagevec_init(&pvec, 0); while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 2505627f024e..582ef53f2104 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -367,7 +367,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, done_index = index; while (!done && (index <= end)) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 3a3821b00486..9deca59be7e5 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2147,8 +2147,8 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, btcache, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 45d650addd56..447999563737 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -262,8 +262,7 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -382,8 +381,8 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - 
PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 5bcd2f32449b..37781eaffc00 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -706,7 +706,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, repeat: if (unlikely(index > last) || !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) + PAGECACHE_TAG_DIRTY)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -749,8 +749,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index cfed0c5ec659..cead4419f933 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -31,16 +31,14 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages); + int tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages) + struct address_space *mapping, pgoff_t *index, int tag) { - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, - nr_pages); + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } static inline void 
pagevec_init(struct pagevec *pvec, int cold) diff --git a/mm/filemap.c b/mm/filemap.c index 1544865fa64a..226e4d89ef5d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -348,7 +348,7 @@ static int __filemap_fdatawait_range(struct address_space *mapping, unsigned i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE); + end, PAGECACHE_TAG_WRITEBACK); if (!nr_pages) break; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4bdd7ef43f6e..38979615c7ad 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2188,7 +2188,7 @@ int write_cache_pages(struct address_space *mapping, int i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/mm/swap.c b/mm/swap.c index bef40c04f864..8e6bcb688779 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1132,10 +1132,10 @@ EXPORT_SYMBOL(pagevec_lookup); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages) + int tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - nr_pages, pvec->pages); + PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } EXPORT_SYMBOL(pagevec_lookup_range_tag); From ed74404955cd8eeaa41ff1aa57a5af6f8e6f62a8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 May 2018 22:25:09 +0800 Subject: [PATCH 0749/1212] f2fs: detect synchronous writeback more earlier This patch detects synchronous writeback earlier than before, in order to avoid unnecessary page writeback before exiting asynchronous writeback.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 02be804b1226..9deff7960bb2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1978,6 +1978,13 @@ static int f2fs_write_cache_pages(struct address_space *mapping, struct page *page = pvec.pages[i]; bool submitted = false; + /* give a priority to WB_SYNC threads */ + if (atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + done_index = page->index; retry_write: lock_page(page); @@ -2032,9 +2039,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, last_idx = page->index; } - /* give a priority to WB_SYNC threads */ - if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || - --wbc->nr_to_write <= 0) && + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; From 9db5be4af890fdacab65a4c746f5e330537d1e16 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Fri, 18 May 2018 11:51:52 +0530 Subject: [PATCH 0750/1212] f2fs: Fix deadlock in shutdown ioctl f2fs_ioc_shutdown() ioctl gets stuck in the below path when issued with F2FS_GOING_DOWN_FULLSYNC option. 
__switch_to+0x90/0xc4 percpu_down_write+0x8c/0xc0 freeze_super+0xec/0x1e4 freeze_bdev+0xc4/0xcc f2fs_ioctl+0xc0c/0x1ce0 f2fs_compat_ioctl+0x98/0x1f0 Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2a87f4531e0e..ab476867c30b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1857,9 +1857,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; + if (in != F2FS_GOING_DOWN_FULLSYNC) { + ret = mnt_want_write_file(filp); + if (ret) + return ret; + } switch (in) { case F2FS_GOING_DOWN_FULLSYNC: @@ -1900,7 +1902,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_update_time(sbi, REQ_TIME); out: - mnt_drop_write_file(filp); + if (in != F2FS_GOING_DOWN_FULLSYNC) + mnt_drop_write_file(filp); return ret; } From c4408c238722fdfd3302be50dcce1f89c12d6666 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 23 Apr 2018 10:36:13 +0800 Subject: [PATCH 0751/1212] f2fs: fix to wait page writeback during revoking atomic write After revoking atomic write, related LBA can be reused by others, so we need to wait page writeback before reusing the LBA, in order to avoid interference between old atomic written in-flight IO and new IO. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4412c506c6ad..a31517e231b6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -230,6 +230,8 @@ static int __revoke_inmem_pages(struct inode *inode, lock_page(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (recover) { struct dnode_of_data dn; struct node_info ni; From 1f62e4702a34d1fc33be8734777fd858b1147bb7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 May 2018 09:00:13 +0800 Subject: [PATCH 0752/1212] f2fs: keep migration IO order in LFS mode For non-migration IO, we will keep order of data/node blocks' submitting as allocation sequence by sorting IOs in per log io_list list, but for migration IO, it could be out-of-order. In LFS mode, we should keep all IOs including migration IO be ordered, so that this patch fixes to add an additional lock to keep submitting order. Signed-off-by: Chao Yu Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/gc.c | 6 ++++++ fs/f2fs/segment.c | 5 +++++ fs/f2fs/super.c | 1 + 4 files changed, 14 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d4a344f2a55..ca5dc3b8d066 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1178,6 +1178,8 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; /* bio ordering for NODE/DATA */ + /* keep migration IO order for LFS mode */ + struct rw_semaphore io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ffcb744ffcfe..bd189e5bc4d7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -614,6 +614,7 @@ static void move_data_block(struct inode *inode, block_t bidx, struct page *page; block_t newaddr; int err; + bool lfs_mode = test_opt(fio.sbi, LFS); /* do not read out */ page = 
f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -654,6 +655,9 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + if (lfs_mode) + down_write(&fio.sbi->io_order_lock); + allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); @@ -710,6 +714,8 @@ static void move_data_block(struct inode *inode, block_t bidx, put_page_out: f2fs_put_page(fio.encrypted_page, 1); recover_block: + if (lfs_mode) + up_write(&fio.sbi->io_order_lock); if (err) __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, true, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a31517e231b6..ce5a2bd19e4b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2831,7 +2831,10 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); int err; + bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); + if (keep_order) + down_read(&fio->sbi->io_order_lock); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); @@ -2844,6 +2847,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) } else if (!err) { update_device_state(fio); } + if (keep_order) + up_read(&fio->sbi->io_order_lock); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cc55475832e2..7064f6e33211 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2367,6 +2367,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_PAGE_TYPE - 1; i++) for (j = HOT; j < NR_TEMP_TYPE; j++) mutex_init(&sbi->wio_mutex[i][j]); + init_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; From 405909e7f53293a13c9a0fad5c81ce1472e9fd32 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 7 May 2018 14:22:40 -0700 Subject: [PATCH 
0753/1212] f2fs: introduce sbi->gc_mode to determine the policy This is to avoid sbi->gc_thread pointer access. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/gc.c | 28 ++++++++++++---------------- fs/f2fs/gc.h | 2 -- fs/f2fs/segment.c | 4 ++-- fs/f2fs/sysfs.c | 33 +++++++++++++++++++++++++-------- 5 files changed, 47 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ca5dc3b8d066..6b3573cf7f10 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1130,6 +1130,13 @@ enum { MAX_TIME, }; +enum { + GC_NORMAL, + GC_IDLE_CB, + GC_IDLE_GREEDY, + GC_URGENT, +}; + enum { WHINT_MODE_OFF, /* not pass down write hints */ WHINT_MODE_USER, /* try to pass down hints given by users */ @@ -1261,6 +1268,7 @@ struct f2fs_sb_info { struct mutex gc_mutex; /* mutex for GC */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + unsigned int gc_mode; /* current GC state */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bd189e5bc4d7..1df27eb2ec14 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,7 +76,7 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (gc_th->gc_urgent) { + if (sbi->gc_mode == GC_URGENT) { wait_ms = gc_th->urgent_sleep_time; mutex_lock(&sbi->gc_mutex); goto do_gc; @@ -131,8 +131,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_idle = 0; - gc_th->gc_urgent = 0; gc_th->gc_wake= 0; sbi->gc_thread = gc_th; @@ -158,21 +156,19 @@ void stop_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = NULL; } -static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) +static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; - if (!gc_th) - return gc_mode; - - if (gc_th->gc_idle) { - if (gc_th->gc_idle == 1) - gc_mode = GC_CB; - else if (gc_th->gc_idle == 2) - gc_mode = GC_GREEDY; - } - if (gc_th->gc_urgent) + switch (sbi->gc_mode) { + case GC_IDLE_CB: + gc_mode = GC_CB; + break; + case GC_IDLE_GREEDY: + case GC_URGENT: gc_mode = GC_GREEDY; + break; + } return gc_mode; } @@ -187,7 +183,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { - p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); + p->gc_mode = select_gc_type(sbi, gc_type); p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; @@ -195,7 +191,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* we need to check every dirty segments in the FG_GC case */ if (gc_type != FG_GC && - (sbi->gc_thread && !sbi->gc_thread->gc_urgent) && + (sbi->gc_mode != GC_URGENT) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b0045d4c8d1e..c8619e408009 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -36,8 +36,6 @@ struct f2fs_gc_kthread { unsigned int no_gc_sleep_time; /* for changing gc mode */ - unsigned int 
gc_idle; - unsigned int gc_urgent; unsigned int gc_wake; }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ce5a2bd19e4b..8656295c76e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -177,7 +177,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) if (test_opt(sbi, LFS)) return false; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + if (sbi->gc_mode == GC_URGENT) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + @@ -1485,7 +1485,7 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) dcc->discard_wake = 0; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + if (sbi->gc_mode == GC_URGENT) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6d8d8f41e517..dd940d156af6 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -248,16 +248,33 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "trim_sections")) return -EINVAL; + if (!strcmp(a->attr.name, "gc_urgent")) { + if (t >= 1) { + sbi->gc_mode = GC_URGENT; + if (sbi->gc_thread) { + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + } else { + sbi->gc_mode = GC_NORMAL; + } + return count; + } + if (!strcmp(a->attr.name, "gc_idle")) { + if (t == GC_IDLE_CB) + sbi->gc_mode = GC_IDLE_CB; + else if (t == GC_IDLE_GREEDY) + sbi->gc_mode = GC_IDLE_GREEDY; + else + sbi->gc_mode = GC_NORMAL; + return count; + } + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); - if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { - sbi->gc_thread->gc_wake = 1; - wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); - wake_up_discard_thread(sbi, true); - } - return count; } @@ -349,8 +366,8 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); 
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); From b125dfb20d18db91eac671aa241346cd1e1c0106 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 May 2018 20:28:54 +0800 Subject: [PATCH 0754/1212] f2fs: avoid stucking GC due to atomic write f2fs doesn't allow abuse on atomic write class interface, so except limiting in-mem pages' total memory usage capacity, we need to limit atomic-write usage as well when filesystem is seriously fragmented, otherwise we may run into infinite loop during foreground GC because target blocks in victim segment belong to an atomically opened file for a long time. Now, we will detect failure due to atomic write in foreground GC, if the count exceeds threshold, we will drop all atomic written data in cache, by this, I expect it can keep our system running safely to prevent DoS attack. In addition, this patch shows GC skip information in debugfs; for now it only shows the count of skips caused by atomic write. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 6 ++++++ fs/f2fs/f2fs.h | 21 +++++++++++++++------ fs/f2fs/file.c | 20 ++++++++++++++------ fs/f2fs/gc.c | 27 +++++++++++++++++++++++---- fs/f2fs/inode.c | 6 ++++-- fs/f2fs/segment.c | 11 ++++++++++- fs/f2fs/segment.h | 2 ++ 8 files changed, 75 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9deff7960bb2..0c9e3e186f79 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2322,7 +2322,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); if (drop_atomic) - drop_inmem_pages_all(sbi); + drop_inmem_pages_all(sbi, false); return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a66107b5cfff..2d65e77ae5cf 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -104,6 +104,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; + si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; + si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -342,6 +344,10 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); + seq_printf(s, "Skipped : atomic write %llu (%llu)\n", + si->skipped_atomic_files[BG_GC] + + si->skipped_atomic_files[FG_GC], + si->skipped_atomic_files[BG_GC]); seq_puts(s, "\nExtent Cache:\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6b3573cf7f10..6a04aae9480e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -681,15 +681,20 @@ enum { #define 
DEF_DIR_LEVEL 0 +enum { + GC_FAILURE_PIN, + GC_FAILURE_ATOMIC, + MAX_GC_FAILURE +}; + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ - union { - unsigned int i_current_depth; /* only for directory depth */ - unsigned short i_gc_failures; /* only for regular file */ - }; + unsigned int i_current_depth; /* only for directory depth */ + /* for gc failure statistic */ + unsigned int i_gc_failures[MAX_GC_FAILURE]; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ @@ -1269,6 +1274,8 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ + /* for skip statistic */ + unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; @@ -2312,6 +2319,7 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ + FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2410,7 +2418,7 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) static inline void f2fs_i_gc_failures_write(struct inode *inode, unsigned int count) { - F2FS_I(inode)->i_gc_failures = count; + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count; f2fs_mark_inode_dirty_sync(inode, true); } @@ -2901,7 +2909,7 @@ void destroy_node_manager_caches(void); */ bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); -void drop_inmem_pages_all(struct f2fs_sb_info *sbi); +void 
drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); @@ -3092,6 +3100,7 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; + unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab476867c30b..4b7629f2422e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1697,6 +1697,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; @@ -1738,12 +1739,17 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: + if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + ret = -EINVAL; + } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); @@ -2532,12 +2538,14 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) /* Use i_gc_failures for normal file as a risk signal. 
*/ if (inc) - f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + f2fs_i_gc_failures_write(inode, + fi->i_gc_failures[GC_FAILURE_PIN] + 1); - if (fi->i_gc_failures > sbi->gc_pin_file_threshold) { + if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: Enable GC = ino %lx after %x GC trials\n", - __func__, inode->i_ino, fi->i_gc_failures); + __func__, inode->i_ino, + fi->i_gc_failures[GC_FAILURE_PIN]); clear_inode_flag(inode, FI_PIN_FILE); return -EAGAIN; } @@ -2575,7 +2583,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); - F2FS_I(inode)->i_gc_failures = 1; + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = 1; goto done; } @@ -2588,7 +2596,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) goto out; set_inode_flag(inode, FI_PIN_FILE); - ret = F2FS_I(inode)->i_gc_failures; + ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; done: f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -2603,7 +2611,7 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) __u32 pin = 0; if (is_inode_flag_set(inode, FI_PIN_FILE)) - pin = F2FS_I(inode)->i_gc_failures; + pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; return put_user(pin, (u32 __user *)arg); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1df27eb2ec14..ef8291f705dc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -592,7 +592,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, * This can be used to move blocks, aka LBAs, directly on disk. 
*/ static void move_data_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) + int gc_type, unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -620,8 +620,11 @@ static void move_data_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; goto out; + } if (f2fs_is_pinned_file(inode)) { f2fs_pin_file_control(inode, true); @@ -733,8 +736,11 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; goto out; + } if (f2fs_is_pinned_file(inode)) { if (gc_type == FG_GC) f2fs_pin_file_control(inode, true); @@ -896,7 +902,8 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_post_read_required(inode)) - move_data_block(inode, start_bidx, segno, off); + move_data_block(inode, start_bidx, gc_type, + segno, off); else move_data_page(inode, start_bidx, gc_type, segno, off); @@ -1013,6 +1020,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1064,11 +1073,21 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, sec_freed++; total_freed += seg_freed; + if (gc_type == FG_GC) { + if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + skipped_round++; + last_skipped = 
sbi->skipped_atomic_files[FG_GC]; + round++; + } + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round > MAX_SKIP_ATOMIC_COUNT && + skipped_round * 2 >= round) + drop_inmem_pages_all(sbi, true); segno = NULL_SEGNO; goto gc_more; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a814dd221eed..2f8c99ab99f7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -235,7 +235,8 @@ static int do_read_inode(struct inode *inode) if (S_ISDIR(inode->i_mode)) fi->i_current_depth = le32_to_cpu(ri->i_current_depth); else if (S_ISREG(inode->i_mode)) - fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures); + fi->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(ri->i_gc_failures); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; @@ -428,7 +429,8 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); else if (S_ISREG(inode->i_mode)) - ri->i_gc_failures = cpu_to_le16(F2FS_I(inode)->i_gc_failures); + ri->i_gc_failures = + cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8656295c76e3..e2317c6c1080 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -273,7 +273,7 @@ static int __revoke_inmem_pages(struct inode *inode, return err; } -void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) { struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; struct inode *inode; @@ -289,9 +289,17 @@ void drop_inmem_pages_all(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); if (inode) { + if (gc_failure) { + if (fi->i_gc_failures[GC_FAILURE_ATOMIC]) + goto drop; + goto skip; 
+ } +drop: + set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); drop_inmem_pages(inode); iput(inode); } +skip: congestion_wait(BLK_RW_ASYNC, HZ/50); cond_resched(); goto next; @@ -311,6 +319,7 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3367ce263fb9..3e7ef7c6771f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,6 +215,8 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) +#define MAX_SKIP_ATOMIC_COUNT 16 + struct inmem_pages { struct list_head list; struct page *page; From b25a1872e9a518c8ea5c76bb8441209db3117574 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 8 Apr 2018 15:11:11 +0800 Subject: [PATCH 0755/1212] f2fs: let discard thread wait a little longer if dev is busy This patch modify discard thread wait policy as below: issued io_interrupted wait time(ms) 1. 8 0 50 2. (0,8) 1 50 3. 0 1 500 (dev is busy) 4. 0 0 60000 (no candidates) Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6a04aae9480e..fab0ccf95a9f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -247,6 +247,7 @@ enum { #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MAX_DISCARD_LEN 512 /* Max. 
2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -349,6 +350,7 @@ enum { struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ + unsigned int mid_interval; /* used for device busy */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e2317c6c1080..fe3b6c3e7553 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1018,6 +1018,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = true; dpolicy->sync = false; @@ -1027,6 +1028,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { @@ -1500,9 +1502,11 @@ static int issue_discard_thread(void *data) sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); - if (issued) { + if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; + } else if (issued == -1){ + wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } From e72a2cca82d8e8809be75012ded23781434d31fa Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: 
Tue, 8 May 2018 17:51:34 +0800 Subject: [PATCH 0756/1212] f2fs: clear discard_wake earlier If SBI_NEED_FSCK is set, discard_wake will never be cleared. As a result, the condition of wait_event_interruptible_timeout() is always true, which gets discard thread run too frequently. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fe3b6c3e7553..0caabb0f42bc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1482,6 +1482,10 @@ static int issue_discard_thread(void *data) kthread_should_stop() || freezing(current) || dcc->discard_wake, msecs_to_jiffies(wait_ms)); + + if (dcc->discard_wake) + dcc->discard_wake = 0; + if (try_to_freeze()) continue; if (f2fs_readonly(sbi->sb)) @@ -1493,9 +1497,6 @@ static int issue_discard_thread(void *data) continue; } - if (dcc->discard_wake) - dcc->discard_wake = 0; - if (sbi->gc_mode == GC_URGENT) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); From c74034518fdc8b21a2b3f0aace06965cea5fa09d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 May 2018 18:03:34 +0800 Subject: [PATCH 0757/1212] f2fs: fix to don't trigger writeback during recovery - f2fs_fill_super - recover_fsync_data - recover_data - del_fsync_inode - iput - iput_final - write_inode_now - f2fs_write_inode - f2fs_balance_fs - f2fs_balance_fs_bg - sync_dirty_inodes With data_flush mount option, during recovery, in order to avoid entering above writeback flow, let's detect recovery status and do skip in f2fs_balance_fs_bg. 
Signed-off-by: Chao Yu Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0caabb0f42bc..4557704a852e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -486,6 +486,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return; + /* try to shrink extent cache when there is no enough memory */ if (!available_free_memory(sbi, EXTENT_CACHE)) f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); From d2e0f2f786a68136d2fb5c57c669896eba2f4d7f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 May 2018 18:03:35 +0800 Subject: [PATCH 0758/1212] f2fs: clean up with clear_radix_tree_dirty_tag Introduce clear_radix_tree_dirty_tag to include common codes for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ fs/f2fs/dir.c | 8 +------- fs/f2fs/f2fs.h | 1 + fs/f2fs/inline.c | 7 +------ fs/f2fs/node.c | 12 ++---------- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0c9e3e186f79..da09ba77f874 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2588,6 +2588,17 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; +void clear_radix_tree_dirty_tag(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); +} + int __init f2fs_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 41d32171bd52..2887bcef118b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -698,8 +698,6 @@ void f2fs_delete_entry(struct 
f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - struct address_space *mapping = page_mapping(page); - unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -732,11 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - + clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fab0ccf95a9f..ccdcf5865fb9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3049,6 +3049,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); +void clear_radix_tree_dirty_tag(struct page *page); /* * gc.c diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 85371b0971d9..b51cc241f354 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -204,8 +204,6 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; - struct address_space *mapping = page_mapping(page); - unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -227,10 +225,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_radix_tree_dirty_tag(page); set_inode_flag(inode, 
FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0d6bb27370ff..2902e1fadebc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -102,18 +102,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; - unsigned int long flags; - if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - + clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); - dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } ClearPageUptodate(page); } From a1259450b6dba27306a065b4e079d8fb234d0a4e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 16:57:32 +0800 Subject: [PATCH 0759/1212] f2fs: fix to avoid race during access gc_thread pointer Thread A Thread B - f2fs_remount - stop_gc_thread - f2fs_sbi_store sbi->gc_thread = NULL; access sbi->gc_thread->gc_* Previously, we allocate memory for sbi->gc_thread based on background gc thread mount option, the memory can be released if we turn off that mount option, but still there are several places that access the gc_thread pointer without considering race condition, resulting in a NULL pointer dereference. In order to fix this issue, use sb->s_umount to exclude those operations. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index dd940d156af6..ac3ea6044936 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -165,7 +165,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, +static ssize_t __f2fs_sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { @@ -278,6 +278,23 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, return count; } +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + ssize_t ret; + bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || + a->struct_type == GC_THREAD); + + if (gc_entry) + down_read(&sbi->sb->s_umount); + ret = __f2fs_sbi_store(a, sbi, buf, count); + if (gc_entry) + up_read(&sbi->sb->s_umount); + + return ret; +} + static ssize_t f2fs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { From 0291bd36d076f2ff8c6c6cc3b8715b3399680e00 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 16:59:26 +0800 Subject: [PATCH 0760/1212] f2fs: don't drop dentry pages after fs shutdown As described in commit "f2fs: don't drop any page on f2fs_cp_error() case": "We still provide readdir() after shtudown, so we should keep pages to avoid additional IOs." In order to provide the latest directory structure, let's keep dentry pages in cache after fs shutdown. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index da09ba77f874..65e7669155e8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1798,6 +1798,12 @@ static int __write_data_page(struct page *page, bool *submitted, /* we should bypass data pages to proceed the kworkder jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(page->mapping, -EIO); + /* + * don't drop any dirty dentry pages for keeping lastest + * directory structure. + */ + if (S_ISDIR(inode->i_mode)) + goto redirty_out; goto out; } From 02afc275a5bd35825e77a91a3aaad8d5aa730dbd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 16:59:27 +0800 Subject: [PATCH 0761/1212] f2fs: fix error path of move_data_page This patch fixes error path of move_data_page: - clear cold data flag if it fails to write page. - redirty page for non-ENOMEM case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ef8291f705dc..2b81537387c9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -780,9 +780,14 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, set_cold_data(page); err = do_write_data_page(&fio); - if (err == -ENOMEM && is_dirty) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; + if (err) { + clear_cold_data(page); + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (is_dirty) + set_page_dirty(page); } } out: From e48fcd857657d3328d9f148f490c64c5e147fb85 Mon Sep 17 00:00:00 2001 From: Anatoly Pugachev Date: Mon, 28 May 2018 02:06:37 +0300 Subject: [PATCH 0762/1212] disable loading f2fs module on PAGE_SIZE > 4KB The following patch disables loading of f2fs module on architectures which have PAGE_SIZE > 4096 , since it is impossible to mount f2fs on such architectures , log messages are: mount: /mnt: 
wrong fs type, bad option, bad superblock on /dev/vdiskb1, missing codepage or helper program, or other error. /dev/vdiskb1: F2FS filesystem, UUID=1d8b9ca4-2389-4910-af3b-10998969f09c, volume name "" May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Invalid page_cache_size (8192), supports only 4KB May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Can't find valid F2FS filesystem in 1th superblock May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Invalid page_cache_size (8192), supports only 4KB May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Can't find valid F2FS filesystem in 2th superblock May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Invalid page_cache_size (8192), supports only 4KB which was introduced by git commit 5c9b469295fb6b10d98923eab5e79c4edb80ed20 tested on git kernel 4.17.0-rc6-00309-gec30dcf7f425 with patch applied: modprobe: ERROR: could not insert 'f2fs': Invalid argument May 28 01:40:28 v215 kernel: F2FS not supported on PAGE_SIZE(8192) != 4096 Signed-off-by: Anatoly Pugachev Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7064f6e33211..9819c04e6848 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3130,6 +3130,12 @@ static int __init init_f2fs_fs(void) { int err; + if (PAGE_SIZE != F2FS_BLKSIZE) { + printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n", + PAGE_SIZE, F2FS_BLKSIZE); + return -EINVAL; + } + f2fs_build_trace_ios(); err = init_inodecache(); From b7f55946709538653ce33f06e67219c15b140039 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 23:47:18 +0800 Subject: [PATCH 0763/1212] f2fs: fix to let caller retry allocating block address Configure io_bits with 2 and enable LFS mode, generic/013 reports below dmesg: BUG: unable to handle kernel NULL pointer dereference at 00000104 *pdpt = 0000000029b7b001 *pde = 0000000000000000 Oops: 0002 [#1] PREEMPT SMP Modules linked in: crc32_generic zram f2fs(O) rfcomm bnep 
bluetooth ecdh_generic snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq pcbc joydev snd_seq_device aesni_intel snd_timer aes_i586 snd crypto_simd cryptd soundcore i2c_piix4 serio_raw mac_hid video parport_pc ppdev lp parport hid_generic psmouse usbhid hid e1000 CPU: 0 PID: 11161 Comm: fsstress Tainted: G O 4.17.0-rc2 #38 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 EIP: f2fs_submit_page_write+0x28d/0x550 [f2fs] EFLAGS: 00010206 CPU: 0 EAX: e863dcd8 EBX: 00000000 ECX: 00000100 EDX: 00000200 ESI: e863dcf4 EDI: f6f82768 EBP: e863dbb0 ESP: e863db74 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 80050033 CR2: 00000104 CR3: 29a62020 CR4: 000406f0 Call Trace: do_write_page+0x6f/0xc0 [f2fs] write_data_page+0x4a/0xd0 [f2fs] do_write_data_page+0x327/0x630 [f2fs] __write_data_page+0x34b/0x820 [f2fs] __f2fs_write_data_pages+0x42d/0x8c0 [f2fs] f2fs_write_data_pages+0x27/0x30 [f2fs] do_writepages+0x1a/0x70 __filemap_fdatawrite_range+0x94/0xd0 filemap_write_and_wait_range+0x3d/0xa0 __generic_file_write_iter+0x11a/0x1f0 f2fs_file_write_iter+0xdd/0x3b0 [f2fs] __vfs_write+0xd2/0x150 vfs_write+0x9b/0x190 ksys_write+0x45/0x90 sys_write+0x16/0x20 do_fast_syscall_32+0xaa/0x22c entry_SYSENTER_32+0x4c/0x7b EIP: 0xb7fc8c51 EFLAGS: 00000246 CPU: 0 EAX: ffffffda EBX: 00000003 ECX: 09cde000 EDX: 00001000 ESI: 00000003 EDI: 00001000 EBP: 00000000 ESP: bfbded38 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b Code: e8 f9 77 34 c9 8b 45 e0 8b 80 b8 00 00 00 39 45 d8 0f 84 bb 02 00 00 8b 45 e0 8b 80 b8 00 00 00 8d 50 d8 8b 08 89 55 f0 8b 50 04 <89> 51 04 89 0a c7 00 00 01 00 00 c7 40 04 00 02 00 00 8b 45 dc EIP: f2fs_submit_page_write+0x28d/0x550 [f2fs] SS:ESP: 0068:e863db74 CR2: 0000000000000104 ---[ end trace 4cac79c0d1305ee6 ]--- allocate_data_block will submit all sequential pending IOs sorted by a FIFO list, If we failed to submit other user's IO due to unaligned write, we will retry to allocate new block address 
for current IO, then it will initialize fio.list again; if fio was already in the list, this breaks the FIFO list and results in the above panic. Thread A Thread B - do_write_page - allocate_data_block - list_add_tail : fioA cached in FIFO list. - do_write_page - allocate_data_block - list_add_tail : fioB cached in FIFO list. - f2fs_submit_page_write : fail to submit IO - allocate_data_block - INIT_LIST_HEAD - f2fs_submit_page_write - list_del <-- NULL pointer dereference To fix this, this patch adds a fio.retry field to indicate the failure status of each IO, and avoids bailing out while there is still pending IO in the FIFO list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++++-------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/gc.c | 5 +++-- fs/f2fs/segment.c | 11 ++++++----- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 65e7669155e8..0b7806cb4e7b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -460,13 +460,12 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -int f2fs_submit_page_write(struct f2fs_io_info *fio) +void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; - int err = 0; f2fs_bug_on(sbi, is_read_io(fio->op)); @@ -476,7 +475,7 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { spin_unlock(&io->io_lock); - goto out_fail; + goto out; } fio = list_first_entry(&io->io_list, struct f2fs_io_info, list); @@ -503,9 +502,9 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) if (io->bio == NULL) { if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { - err = -EAGAIN; dec_page_count(sbi, WB_DATA_TYPE(bio_page)); - goto out_fail; + fio->retry = true; + goto skip; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, 
fio->io_wbc, BIO_MAX_PAGES, false, @@ -525,12 +524,11 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); - +skip: if (fio->in_list) goto next; -out_fail: +out: up_write(&io->io_rwsem); - return err; } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ccdcf5865fb9..f702aeaf6f38 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1076,6 +1076,7 @@ struct f2fs_io_info { int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ bool is_meta; /* indicate borrow meta inode mapping or not */ + bool retry; /* need to reallocate block address */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ }; @@ -3013,7 +3014,7 @@ void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); -int f2fs_submit_page_write(struct f2fs_io_info *fio); +void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2b81537387c9..45713a64612d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -603,6 +603,7 @@ static void move_data_block(struct inode *inode, block_t bidx, .op_flags = REQ_SYNC, .encrypted_page = NULL, .in_list = false, + .retry = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -697,8 +698,8 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - err = f2fs_submit_page_write(&fio); - if (err) { + f2fs_submit_page_write(&fio); + if (fio.retry) { if (PageWriteback(fio.encrypted_page)) 
end_page_writeback(fio.encrypted_page); goto put_page_out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4557704a852e..507f697178b6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2812,6 +2812,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, INIT_LIST_HEAD(&fio->list); fio->in_list = true; + fio->retry = false; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); @@ -2847,7 +2848,6 @@ static void update_device_state(struct f2fs_io_info *fio) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); - int err; bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); if (keep_order) @@ -2857,13 +2857,14 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_write(fio); - if (err == -EAGAIN) { + f2fs_submit_page_write(fio); + if (fio->retry) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; - } else if (!err) { - update_device_state(fio); } + + update_device_state(fio); + if (keep_order) up_read(&fio->sbi->io_order_lock); } From cc8093af7c420333d412ec5ef748900a53c433df Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 23:47:19 +0800 Subject: [PATCH 0764/1212] f2fs: fix to avoid accessing cross the boundary Configure io_bits with 2 and enable LFS mode, generic/017 reports below dmesg: BUG: unable to handle kernel NULL pointer dereference at 00000039 *pdpt = 000000002fcb2001 *pde = 0000000000000000 Oops: 0000 [#1] PREEMPT SMP Modules linked in: crc32_generic zram f2fs(O) bnep rfcomm bluetooth ecdh_generic snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi pcbc snd_seq joydev aesni_intel aes_i586 snd_seq_device snd_timer crypto_simd cryptd snd soundcore i2c_piix4 serio_raw mac_hid video parport_pc ppdev lp parport 
hid_generic usbhid psmouse hid e1000 CPU: 2 PID: 20779 Comm: xfs_io Tainted: G O 4.17.0-rc2 #38 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 EIP: is_checkpointed_data+0x84/0xd0 [f2fs] EFLAGS: 00010207 CPU: 2 EAX: 00000000 EBX: f5cd7000 ECX: fffffe32 EDX: 00000039 ESI: 000001cd EDI: ec95fb6c EBP: e264bd80 ESP: e264bd6c DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 80050033 CR2: 00000039 CR3: 2fe55660 CR4: 000406f0 Call Trace: __exchange_data_block+0xb3f/0x1000 [f2fs] f2fs_fallocate+0xab9/0x16b0 [f2fs] vfs_fallocate+0x17c/0x2d0 ksys_fallocate+0x42/0x70 sys_fallocate+0x31/0x40 do_fast_syscall_32+0xaa/0x22c entry_SYSENTER_32+0x4c/0x7b EIP: 0xb7f98c51 EFLAGS: 00000293 CPU: 2 EAX: ffffffda EBX: 00000003 ECX: 00000008 EDX: 01001000 ESI: 00000000 EDI: 00001000 EBP: 00000000 ESP: bfc0357c DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b Code: 00 00 d3 e8 8b 4d ec 2b 02 8b 55 f0 6b c0 1c 03 41 70 29 d6 8b 93 d0 06 00 00 8b 40 0c 83 ea 01 21 d6 89 f2 89 f1 c1 ea 03 f7 d1 <0f> be 14 10 83 e1 07 b8 01 00 00 00 d3 e0 85 c2 89 f8 0f 95 c3 EIP: is_checkpointed_data+0x84/0xd0 [f2fs] SS:ESP: 0068:e264bd6c CR2: 0000000000000039 ---[ end trace 9a4d4087cce6080a ]--- This is because, in the recovery flow of __exchange_data_block, we passed len instead of olen to __roll_back_blkaddrs. len indicates the wrong array size, which results in random block addresses being copied into the dnode page. Later, once such a random block address is accessed by is_checkpointed_data, it can cause a NULL pointer dereference. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4b7629f2422e..0c2af49be162 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1151,7 +1151,7 @@ static int __exchange_data_block(struct inode *src_inode, return 0; roll_back: - __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, olen); kvfree(src_blkaddr); kvfree(do_replace); return ret; From 5d1633aa1071aa481442434a540c084a10efd810 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 May 2018 00:20:39 +0800 Subject: [PATCH 0765/1212] f2fs: make __f2fs_write_data_pages() static Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0b7806cb4e7b..bf55ee0dfccd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2069,7 +2069,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, return ret; } -int __f2fs_write_data_pages(struct address_space *mapping, +static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f702aeaf6f38..7ce0272733e2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3039,9 +3039,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); -int __f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc, - enum iostat_type io_type); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); From 
fcf37e16f3cb91bad01a7ca1df5ebd032de614f3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 May 2018 00:20:40 +0800 Subject: [PATCH 0766/1212] f2fs: make set_de_type() static Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2887bcef118b..f8e7bafd092a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -60,7 +60,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, }; -void set_de_type(struct f2fs_dir_entry *de, umode_t mode) +static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) { de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7ce0272733e2..c80ee4b1fa51 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2798,7 +2798,6 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *de, umode_t mode); unsigned char get_de_type(struct f2fs_dir_entry *de); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, From c35da89531b3cf7939498e4e1f39bf9338ebc10f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 May 2018 00:20:41 +0800 Subject: [PATCH 0767/1212] f2fs: clean up symbol namespace As Ted reported: "Hi, I was looking at f2fs's sources recently, and I noticed that there is a very large number of non-static symbols which don't have a f2fs prefix. There's well over a hundred (see attached below). As one example, in fs/f2fs/dir.c there is: unsigned char get_de_type(struct f2fs_dir_entry *de) This function is clearly only useful for f2fs, but it has a generic name. This means that if any other file system tries to have the same symbol name, there will be a symbol conflict and the kernel would not successfully build. 
It also means that when someone is looking f2fs sources, it's not at all obvious whether a function such as read_data_page(), invalidate_blocks(), is a generic kernel function found in the fs, mm, or block layers, or a f2fs specific function. You might want to fix this at some point. Hopefully Kent's bcachefs isn't similarly using genericly named functions, since that might cause conflicts with f2fs's functions --- but just as this would be a problem that we would rightly insist that Kent fix, this is something that we should have rightly insisted that f2fs should have fixed before it was integrated into the mainline kernel. acquire_orphan_inode add_ino_entry add_orphan_inode allocate_data_block allocate_new_segments alloc_nid alloc_nid_done alloc_nid_failed available_free_memory ...." This patch adds "f2fs_" prefix for all non-static symbols in order to: a) avoid conflict with other kernel generic symbols; b) to indicate the function is f2fs specific one instead of generic one; Reported-by: Theodore Ts'o Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 134 +++++++++--------- fs/f2fs/data.c | 115 ++++++++-------- fs/f2fs/dir.c | 68 ++++----- fs/f2fs/extent_cache.c | 22 +-- fs/f2fs/f2fs.h | 305 +++++++++++++++++++++-------------------- fs/f2fs/file.c | 136 +++++++++--------- fs/f2fs/gc.c | 64 ++++----- fs/f2fs/inline.c | 69 +++++----- fs/f2fs/inode.c | 52 +++---- fs/f2fs/namei.c | 55 ++++---- fs/f2fs/node.c | 223 +++++++++++++++--------------- fs/f2fs/recovery.c | 63 ++++----- fs/f2fs/segment.c | 185 +++++++++++++------------ fs/f2fs/shrinker.c | 4 +- fs/f2fs/super.c | 94 ++++++------- fs/f2fs/sysfs.c | 8 +- fs/f2fs/xattr.c | 26 ++-- 17 files changed, 826 insertions(+), 797 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0159a84ba02d..b00c807c8c8b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -24,7 +24,7 @@ #include static struct kmem_cache *ino_entry_slab; -struct kmem_cache 
*inode_entry_slab; +struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { @@ -36,7 +36,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) /* * We guarantee no failure on the returned page. */ -struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; @@ -108,18 +108,19 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, return page; } -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, true); } /* for POR only */ -struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, false); } -bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) +bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) { switch (type) { case META_NAT: @@ -153,7 +154,7 @@ bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { struct page *page; @@ -176,7 +177,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if (!is_valid_meta_blkaddr(sbi, blkno, type)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkno, type)) goto out; switch (type) { @@ -220,7 +221,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, return blkno - start; } -void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +void 
f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; bool readahead = false; @@ -231,7 +232,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, @@ -252,7 +253,7 @@ static int __f2fs_write_meta_page(struct page *page, if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; - write_meta_page(sbi, page, io_type); + f2fs_do_write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -297,7 +298,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); + written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -308,7 +309,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, return 0; } -long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); @@ -458,20 +459,20 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, 0, type); } -void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry 
from list */ __remove_ino_entry(sbi, ino, type); } /* mode should be APPEND_INO or UPDATE_INO */ -bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) { struct inode_management *im = &sbi->im[mode]; struct ino_entry *e; @@ -482,7 +483,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_ino_entry(struct f2fs_sb_info *sbi, bool all) +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; @@ -501,13 +502,13 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } -void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { __add_ino_entry(sbi, ino, devidx, type); } -bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; @@ -522,7 +523,7 @@ bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, return is_dirty; } -int acquire_orphan_inode(struct f2fs_sb_info *sbi) +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; int err = 0; @@ -545,7 +546,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) return err; } -void release_orphan_inode(struct f2fs_sb_info *sbi) +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -555,14 +556,14 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct inode *inode) +void f2fs_add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); - update_inode_page(inode); + f2fs_update_inode_page(inode); } -void 
remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { /* remove orphan entry from orphan list */ __remove_ino_entry(sbi, ino, ORPHAN_INO); @@ -572,7 +573,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct node_info ni; - int err = acquire_orphan_inode(sbi); + int err = f2fs_acquire_orphan_inode(sbi); if (err) goto err_out; @@ -600,7 +601,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - get_node_info(sbi, ino, &ni); + f2fs_get_node_info(sbi, ino, &ni); /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { @@ -618,7 +619,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return err; } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; @@ -646,10 +647,10 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); + f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { - struct page *page = get_meta_page(sbi, start_blk + i); + struct page *page = f2fs_get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; orphan_blk = (struct f2fs_orphan_block *)page_address(page); @@ -699,7 +700,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = grab_meta_page(sbi, start_blk++); + page = f2fs_grab_meta_page(sbi, start_blk++); orphan_blk = (struct f2fs_orphan_block *)page_address(page); 
memset(orphan_blk, 0, sizeof(*orphan_blk)); @@ -741,7 +742,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, size_t crc_offset = 0; __u32 crc = 0; - *cp_page = get_meta_page(sbi, cp_addr); + *cp_page = f2fs_get_meta_page(sbi, cp_addr); *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); @@ -794,7 +795,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, return NULL; } -int get_valid_checkpoint(struct f2fs_sb_info *sbi) +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *cp_block; struct f2fs_super_block *fsb = sbi->raw_super; @@ -838,7 +839,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) memcpy(sbi->ckpt, cp_block, blk_size); /* Sanity checking of checkpoint */ - if (sanity_check_ckpt(sbi)) + if (f2fs_sanity_check_ckpt(sbi)) goto free_fail_no_cp; if (cur_page == cp1) @@ -857,7 +858,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) void *sit_bitmap_ptr; unsigned char *ckpt = (unsigned char *)sbi->ckpt; - cur_page = get_meta_page(sbi, cp_blk_no + i); + cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); sit_bitmap_ptr = page_address(cur_page); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); f2fs_put_page(cur_page, 1); @@ -902,7 +903,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type) stat_dec_dirty_inode(F2FS_I_SB(inode), type); } -void update_dirty_page(struct inode *inode, struct page *page) +void f2fs_update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; @@ -921,7 +922,7 @@ void update_dirty_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); } -void remove_dirty_inode(struct inode *inode) +void f2fs_remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; @@ -938,7 +939,7 @@ void remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; struct inode *inode; @@ -1021,7 +1022,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) /* it's on eviction */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - update_inode_page(inode); + f2fs_update_inode_page(inode); iput(inode); } } @@ -1061,7 +1062,7 @@ static int block_operations(struct f2fs_sb_info *sbi) /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; cond_resched(); @@ -1089,7 +1090,7 @@ static int block_operations(struct f2fs_sb_info *sbi) if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1183,10 +1184,10 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, /* * pagevec_lookup_tag and lock_page again will take - * some extra time. Therefore, update_meta_pages and - * sync_meta_pages are combined in this function. + * some extra time. Therefore, f2fs_update_meta_pages and + * f2fs_sync_meta_pages are combined in this function. 
*/ - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); int err; memcpy(page_address(page), src, PAGE_SIZE); @@ -1224,7 +1225,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1253,7 +1254,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* 2 cp + n data seg summary + orphan inode blocks */ - data_sum_blocks = npages_for_summary_flush(sbi, false); + data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); @@ -1298,22 +1299,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) - update_meta_page(sbi, nm_i->nat_bits + + f2fs_update_meta_page(sbi, nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), blk + i); /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, + FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } } /* write out checkpoint buffer at block 0 */ - update_meta_page(sbi, ckpt, start_blk++); + f2fs_update_meta_page(sbi, ckpt, start_blk++); for (i = 1; i < 1 + cp_payload_blks; i++) - update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, start_blk++); if (orphan_num) { @@ -1321,7 +1323,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += orphan_blocks; } - write_data_summaries(sbi, start_blk); + 
f2fs_write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; /* Record write statistics in the hot node summary */ @@ -1332,7 +1334,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); if (__remain_node_summaries(cpc->reason)) { - write_node_summaries(sbi, start_blk); + f2fs_write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } @@ -1341,7 +1343,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1358,7 +1360,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); - release_ino_entry(sbi, false); + f2fs_release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) return -EIO; @@ -1383,7 +1385,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* * We guarantee that this checkpoint procedure will not fail. 
*/ -int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; @@ -1416,7 +1418,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason & CP_DISCARD) { - if (!exist_trim_candidates(sbi, cpc)) { + if (!f2fs_exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; } @@ -1424,8 +1426,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); + f2fs_flush_sit_entries(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); unblock_operations(sbi); goto out; } @@ -1440,15 +1442,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi, cpc); - flush_sit_entries(sbi, cpc); + f2fs_flush_nat_entries(sbi, cpc); + f2fs_flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); if (err) - release_discard_addrs(sbi); + f2fs_release_discard_addrs(sbi); else - clear_prefree_segments(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1465,7 +1467,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return err; } -void init_ino_entry_info(struct f2fs_sb_info *sbi) +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) { int i; @@ -1483,23 +1485,23 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) F2FS_ORPHANS_PER_BLOCK; } -int __init create_checkpoint_caches(void) +int __init f2fs_create_checkpoint_caches(void) { ino_entry_slab = 
f2fs_kmem_cache_create("f2fs_ino_entry", sizeof(struct ino_entry)); if (!ino_entry_slab) return -ENOMEM; - inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", sizeof(struct inode_entry)); - if (!inode_entry_slab) { + if (!f2fs_inode_entry_slab) { kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; } -void destroy_checkpoint_caches(void) +void f2fs_destroy_checkpoint_caches(void) { kmem_cache_destroy(ino_entry_slab); - kmem_cache_destroy(inode_entry_slab); + kmem_cache_destroy(f2fs_inode_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bf55ee0dfccd..4b0db685e5d5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -245,7 +245,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, type, temp); } if (wbc) wbc_init_bio(wbc, bio); @@ -602,7 +602,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) * ->node_page * update block addresses in the node page */ -void set_data_blkaddr(struct dnode_of_data *dn) +void f2fs_set_data_blkaddr(struct dnode_of_data *dn) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true); __set_data_blkaddr(dn); @@ -613,12 +613,12 @@ void set_data_blkaddr(struct dnode_of_data *dn) void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); f2fs_update_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ -int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err; @@ -652,12 +652,12 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) } /* Should 
keep dn->ofs_in_node unchanged */ -int reserve_new_block(struct dnode_of_data *dn) +int f2fs_reserve_new_block(struct dnode_of_data *dn) { unsigned int ofs_in_node = dn->ofs_in_node; int ret; - ret = reserve_new_blocks(dn, 1); + ret = f2fs_reserve_new_blocks(dn, 1); dn->ofs_in_node = ofs_in_node; return ret; } @@ -667,12 +667,12 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) bool need_put = dn->inode_page ? false : true; int err; - err = get_dnode_of_data(dn, index, ALLOC_NODE); + err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; if (dn->data_blkaddr == NULL_ADDR) - err = reserve_new_block(dn); + err = f2fs_reserve_new_block(dn); if (err || need_put) f2fs_put_dnode(dn); return err; @@ -691,7 +691,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) return f2fs_reserve_block(dn, index); } -struct page *get_read_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; @@ -710,7 +710,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, } set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) goto put_err; f2fs_put_dnode(&dn); @@ -729,7 +729,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, * A new dentry page is allocated but not able to be written, since its * new inode page couldn't be allocated due to -ENOSPC. * In such the case, its blkaddr can be remained as NEW_ADDR. - * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + * see, f2fs_add_link -> f2fs_get_new_data_page -> + * f2fs_init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); @@ -749,7 +750,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, return ERR_PTR(err); } -struct page *find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -759,7 +760,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, REQ_SYNC, false); + page = f2fs_get_read_data_page(inode, index, REQ_SYNC, false); if (IS_ERR(page)) return page; @@ -779,13 +780,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, REQ_SYNC, for_write); + page = f2fs_get_read_data_page(inode, index, REQ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -811,7 +812,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, * Note that, ipage is set only by make_empty_dir, and if any error occur, * ipage should be released by this function. 
*/ -struct page *get_new_data_page(struct inode *inode, +struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { struct address_space *mapping = inode->i_mapping; @@ -850,7 +851,7 @@ struct page *get_new_data_page(struct inode *inode, /* if ipage exists, blkaddr should be NEW_ADDR */ f2fs_bug_on(F2FS_I_SB(inode), ipage); - page = get_lock_data_page(inode, index, true); + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return page; } @@ -882,15 +883,15 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; alloc: - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL, false); - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) f2fs_i_size_write(dn->inode, @@ -928,7 +929,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { - map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); + map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); flag = f2fs_force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; @@ -1018,7 +1019,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); + err = f2fs_get_dnode_of_data(&dn, pgofs, mode); if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; @@ -1026,10 +1027,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = - get_next_page_offset(&dn, pgofs); + f2fs_get_next_page_offset(&dn, pgofs); if (map->m_next_extent) *map->m_next_extent = - get_next_page_offset(&dn, pgofs); + f2fs_get_next_page_offset(&dn, pgofs); } goto unlock_out; } @@ -1115,7 +1116,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, (pgofs == end || dn.ofs_in_node == end_offset)) { dn.ofs_in_node = ofs_in_node; - err = reserve_new_blocks(&dn, prealloc); + err = f2fs_reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; @@ -1234,7 +1235,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, { return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DEFAULT, NULL, - rw_hint_to_seg_type( + f2fs_rw_hint_to_seg_type( inode->i_write_hint)); } @@ -1279,7 +1280,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - get_node_info(sbi, inode->i_ino, &ni); + f2fs_get_node_info(sbi, inode->i_ino, &ni); phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + @@ -1306,7 +1307,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - get_node_info(sbi, xnid, &ni); + f2fs_get_node_info(sbi, xnid, &ni); phys = (__u64)blk_to_logical(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; @@ -1609,12 +1610,12 @@ static inline bool check_inplace_update_policy(struct inode *inode, if (policy & (0x1 << F2FS_IPU_FORCE)) return true; - if (policy & (0x1 << 
F2FS_IPU_SSR) && need_SSR(sbi)) + if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) return true; if (policy & (0x1 << F2FS_IPU_UTIL) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; @@ -1635,7 +1636,7 @@ static inline bool check_inplace_update_policy(struct inode *inode, return false; } -bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { if (f2fs_is_pinned_file(inode)) return true; @@ -1647,7 +1648,7 @@ bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return check_inplace_update_policy(inode, fio); } -bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1670,13 +1671,13 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (should_update_outplace(inode, fio)) + if (f2fs_should_update_outplace(inode, fio)) return false; - return should_update_inplace(inode, fio); + return f2fs_should_update_inplace(inode, fio); } -int do_write_data_page(struct f2fs_io_info *fio) +int f2fs_do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; @@ -1701,7 +1702,7 @@ int do_write_data_page(struct f2fs_io_info *fio) if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) return -EAGAIN; - err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) goto out; @@ -1728,7 +1729,7 @@ int do_write_data_page(struct f2fs_io_info *fio) f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) 
f2fs_unlock_op(fio->sbi); - err = rewrite_data_page(fio); + err = f2fs_inplace_write_data(fio); trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -1750,7 +1751,7 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageError(page); /* LFS mode write path */ - write_data_page(&dn, fio); + f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -1826,13 +1827,13 @@ static int __write_data_page(struct page *page, bool *submitted, /* we should not write 0'th page having journal header */ if (f2fs_is_volatile_file(inode) && (!page->index || (!wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)))) + f2fs_available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { fio.need_lock = LOCK_DONE; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); goto done; } @@ -1851,10 +1852,10 @@ static int __write_data_page(struct page *page, bool *submitted, } if (err == -EAGAIN) { - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { fio.need_lock = LOCK_REQ; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); } } @@ -1879,7 +1880,7 @@ static int __write_data_page(struct page *page, bool *submitted, if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); clear_inode_flag(inode, FI_HOT_DATA); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); submitted = NULL; } @@ -2092,7 +2093,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && - available_free_memory(sbi, DIRTY_DENTS)) + f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; /* skip writing during file defragment */ @@ -2118,7 +2119,7 @@ 
static int __f2fs_write_data_pages(struct address_space *mapping, * to detect pending bios. */ - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); return ret; skip_write: @@ -2145,7 +2146,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) if (to > i_size) { down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -2177,7 +2178,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, } restart: /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -2187,7 +2188,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { - read_inline_data(page, ipage); + f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_inline_node(ipage); @@ -2205,7 +2206,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, @@ -2242,7 +2243,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); if (f2fs_is_atomic_file(inode) && - !available_free_memory(sbi, INMEM_PAGES)) { + !f2fs_available_free_memory(sbi, INMEM_PAGES)) { err = -ENOMEM; drop_atomic = true; goto fail; @@ -2326,7 +2327,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); if (drop_atomic) - drop_inmem_pages_all(sbi, false); + 
f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -2448,13 +2449,13 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, dec_page_count(sbi, F2FS_DIRTY_NODES); } else { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); } } /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return drop_inmem_page(inode, page); + return f2fs_drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -2487,7 +2488,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { - register_inmem_page(inode, page); + f2fs_register_inmem_page(inode, page); return 1; } /* @@ -2499,7 +2500,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - update_dirty_page(inode, page); + f2fs_update_dirty_page(inode, page); return 1; } return 0; @@ -2592,7 +2593,7 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; -void clear_radix_tree_dirty_tag(struct page *page) +void f2fs_clear_radix_tree_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f8e7bafd092a..a7feed756592 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -65,7 +65,7 @@ static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -unsigned char get_de_type(struct f2fs_dir_entry *de) +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) { if (de->file_type < F2FS_FT_MAX) return f2fs_filetype_table[de->file_type]; @@ -97,14 +97,14 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); - de = find_target_dentry(fname, namehash, 
max_slots, &d); + de = f2fs_find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; return de; } -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, +struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { @@ -171,7 +171,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; @@ -210,7 +210,7 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, if (f2fs_has_inline_dentry(dir)) { *res_page = NULL; - de = find_in_inline_dir(dir, fname, res_page); + de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } @@ -319,7 +319,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -void do_make_empty_dir(struct inode *inode, struct inode *parent, +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { struct qstr dot = QSTR_INIT(".", 1); @@ -340,23 +340,23 @@ static int make_empty_dir(struct inode *inode, struct f2fs_dentry_ptr d; if (f2fs_has_inline_dentry(inode)) - return make_empty_inline_dir(inode, parent, page); + return f2fs_make_empty_inline_dir(inode, parent, page); - dentry_page = get_new_data_page(inode, page, 0, true); + dentry_page = f2fs_get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); - do_make_empty_dir(inode, parent, &d); + f2fs_do_make_empty_dir(inode, parent, &d); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; } -struct page *init_inode_metadata(struct inode *inode, struct inode *dir, +struct 
page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct page *dpage) { @@ -365,7 +365,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { - page = new_inode_page(inode); + page = f2fs_new_inode_page(inode); if (IS_ERR(page)) return page; @@ -395,7 +395,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; } } else { - page = get_node_page(F2FS_I_SB(dir), inode->i_ino); + page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } @@ -418,19 +418,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * we should remove this inode from orphan list. */ if (inode->i_nlink == 0) - remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); + f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); f2fs_i_links_write(inode, true); } return page; put_error: clear_nlink(inode); - update_inode(inode, page); + f2fs_update_inode(inode, page); f2fs_put_page(page, 1); return ERR_PTR(err); } -void update_parent_metadata(struct inode *dir, struct inode *inode, +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -448,7 +448,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_INC_LINK); } -int room_for_filename(const void *bitmap, int slots, int max_slots) +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) { int bit_start = 0; int zero_start, zero_end; @@ -537,12 +537,12 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - dentry_page = get_new_data_page(dir, NULL, block, true); + dentry_page = f2fs_get_new_data_page(dir, NULL, block, 
true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; @@ -558,7 +558,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, + page = f2fs_init_inode_metadata(inode, dir, new_name, orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -576,7 +576,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, f2fs_put_page(page, 1); } - update_parent_metadata(dir, inode, current_depth); + f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) up_write(&F2FS_I(inode)->i_sem); @@ -586,7 +586,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, return err; } -int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, +int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, struct inode *inode, nid_t ino, umode_t mode) { struct qstr new_name; @@ -610,7 +610,7 @@ int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). 
*/ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; @@ -639,7 +639,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } fscrypt_free_filename(&fname); return err; @@ -651,7 +651,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL, NULL); + page = f2fs_init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -683,9 +683,9 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) - add_orphan_inode(inode); + f2fs_add_orphan_inode(inode); else - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); } /* @@ -703,7 +703,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) - add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -729,13 +729,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && - !truncate_hole(dir, page->index, page->index + 1)) { - clear_radix_tree_dirty_tag(page); + !f2fs_truncate_hole(dir, page->index, page->index + 1)) { + f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); - remove_dirty_inode(dir); + 
f2fs_remove_dirty_inode(dir); } f2fs_put_page(page, 1); } @@ -752,7 +752,7 @@ bool f2fs_empty_dir(struct inode *dir) return f2fs_empty_inline_dir(dir); for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = get_lock_data_page(dir, bidx, false); + dentry_page = f2fs_get_lock_data_page(dir, bidx, false); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) continue; @@ -800,7 +800,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - d_type = get_de_type(de); + d_type = f2fs_get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -824,7 +824,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, return 1; if (sbi->readdir_ra == 1) - ra_node_page(sbi, le32_to_cpu(de->ino)); + f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; @@ -874,7 +874,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = get_lock_data_page(inode, n, false); + dentry_page = f2fs_get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index d5a861bf2b42..231b77ef5a53 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -49,7 +49,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, return NULL; } -struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -61,7 +61,7 @@ struct rb_entry *__lookup_rb_tree(struct rb_root *root, return re; } -struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct 
rb_root *root, struct rb_node **parent, unsigned int ofs) { @@ -92,7 +92,7 @@ struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simpfy the insertion after. * tree must stay unchanged between lookup and insertion. */ -struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, @@ -159,7 +159,7 @@ struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, return re; } -bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root *root) { #ifdef CONFIG_F2FS_CHECK_FS @@ -390,7 +390,7 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = (struct extent_node *)__lookup_rb_tree(&et->root, + en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root, (struct rb_entry *)et->cached_en, pgofs); if (!en) goto out; @@ -470,7 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); + p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -520,7 +520,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. 
lookup first extent node in range [fofs, fofs + len - 1] */ - en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, (struct rb_entry *)et->cached_en, fofs, (struct rb_entry **)&prev_en, (struct rb_entry **)&next_en, @@ -773,7 +773,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) else blkaddr = dn->data_blkaddr; - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } @@ -788,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } -void init_extent_cache_info(struct f2fs_sb_info *sbi) +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); mutex_init(&sbi->extent_tree_lock); @@ -800,7 +800,7 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->total_ext_node, 0); } -int __init create_extent_cache(void) +int __init f2fs_create_extent_cache(void) { extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", sizeof(struct extent_tree)); @@ -815,7 +815,7 @@ int __init create_extent_cache(void) return 0; } -void destroy_extent_cache(void) +void f2fs_destroy_extent_cache(void) { kmem_cache_destroy(extent_node_slab); kmem_cache_destroy(extent_tree_slab); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c80ee4b1fa51..e91f7ff71dc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2710,7 +2710,7 @@ static inline int get_inline_xattr_addrs(struct inode *inode) return F2FS_I(inode)->i_inline_xattr_size; } -#define get_inode_mode(i) \ +#define f2fs_get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -2760,14 +2760,14 @@ static inline bool is_valid_blkaddr(block_t blkaddr) * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void truncate_data_blocks(struct dnode_of_data *dn); -int truncate_blocks(struct inode *inode, u64 from, bool lock); +void f2fs_truncate_data_blocks(struct dnode_of_data *dn); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int f2fs_setattr(struct dentry *dentry, struct iattr *attr); -int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); -void truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2781,37 +2781,37 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); -int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); -void update_inode(struct inode *inode, struct page *node_page); -void update_inode_page(struct inode *inode); +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_update_inode(struct inode *inode, struct page *node_page); +void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); -void handle_failed_inode(struct inode *inode); +void 
f2fs_handle_failed_inode(struct inode *inode); /* * namei.c */ -int update_extension_list(struct f2fs_sb_info *sbi, const char *name, +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -unsigned char get_de_type(struct f2fs_dir_entry *de); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d); int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); -void do_make_empty_dir(struct inode *inode, struct inode *parent, +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d); -struct page *init_inode_metadata(struct inode *inode, struct inode *dir, +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct page *dpage); -void update_parent_metadata(struct inode *dir, struct inode *inode, +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth); -int room_for_filename(const void *bitmap, int slots, int max_slots); +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); void f2fs_drop_nlink(struct inode *dir, struct inode *inode); struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, struct fscrypt_name *fname, struct page **res_page); @@ -2828,9 +2828,9 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode); -int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, +int f2fs_add_dentry(struct inode 
*dir, struct fscrypt_name *fname, struct inode *inode, nid_t ino, umode_t mode); -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode); @@ -2839,7 +2839,7 @@ bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { - return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name, + return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } @@ -2854,7 +2854,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); -int sanity_check_ckpt(struct f2fs_sb_info *sbi); +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c @@ -2868,139 +2868,146 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, struct dnode_of_data; struct node_info; -int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); -bool available_free_memory(struct f2fs_sb_info *sbi, int type); -int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); -bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); -void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); -pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); -int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode); -int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); -int remove_inode_page(struct inode *inode); -struct page *new_inode_page(struct inode *inode); 
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); -void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); -struct page *get_node_page_ra(struct page *parent, int start); -void move_node_page(struct page *node_page, int gc_type); -int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni); +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); +int f2fs_truncate_xattr_node(struct inode *inode); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_remove_inode_page(struct inode *inode); +struct page *f2fs_new_inode_page(struct inode *inode); +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *f2fs_get_node_page_ra(struct page *parent, int start); +void f2fs_move_node_page(struct page *node_page, int gc_type); +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); -void build_free_nids(struct f2fs_sb_info *sbi, 
bool sync, bool mount); -bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); -void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); -void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); -int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); -void recover_inline_xattr(struct inode *inode, struct page *page); -int recover_xattr_data(struct inode *inode, struct page *page); -int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -void restore_node_summary(struct f2fs_sb_info *sbi, +void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct page *page); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int build_node_manager(struct f2fs_sb_info *sbi); -void destroy_node_manager(struct f2fs_sb_info *sbi); -int __init create_node_manager_caches(void); -void destroy_node_manager_caches(void); +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_node_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_node_manager_caches(void); +void f2fs_destroy_node_manager_caches(void); /* * segment.c */ -bool need_SSR(struct f2fs_sb_info *sbi); -void register_inmem_page(struct inode *inode, struct page *page); -void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void drop_inmem_pages(struct inode *inode); -void 
drop_inmem_page(struct inode *inode, struct page *page); -int commit_inmem_pages(struct inode *inode); +bool f2fs_need_SSR(struct f2fs_sb_info *sbi); +void f2fs_register_inmem_page(struct inode *inode, struct page *page); +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); +void f2fs_drop_inmem_pages(struct inode *inode); +void f2fs_drop_inmem_page(struct inode *inode, struct page *page); +int f2fs_commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); -int create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); -void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); -bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void drop_discard_cmd(struct f2fs_sb_info *sbi); -void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); -void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); -void release_discard_addrs(struct f2fs_sb_info *sbi); -int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); -void allocate_new_segments(struct f2fs_sb_info *sbi); +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void 
f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); -bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); -struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); -void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, + block_t blk_addr); +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type); -void write_node_page(unsigned int nid, struct f2fs_io_info *fio); -void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -int rewrite_data_page(struct f2fs_io_info *fio); -void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio); +int f2fs_inplace_write_data(struct f2fs_io_info *fio); +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); -void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); void 
f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); -void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); -void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int build_segment_manager(struct f2fs_sb_info *sbi); -void destroy_segment_manager(struct f2fs_sb_info *sbi); -int __init create_segment_manager_caches(void); -void destroy_segment_manager_caches(void); -int rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, - enum temp_type temp); +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_segment_manager_caches(void); +void f2fs_destroy_segment_manager_caches(void); +int f2fs_rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp); /* * checkpoint.c */ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); -struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page 
*f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); -void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); -long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); -void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); -void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); -void release_ino_entry(struct f2fs_sb_info *sbi, bool all); -bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); -void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); -bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); -int acquire_orphan_inode(struct f2fs_sb_info *sbi); -void release_orphan_inode(struct f2fs_sb_info *sbi); -void add_orphan_inode(struct inode *inode); -void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); -int recover_orphan_inodes(struct f2fs_sb_info *sbi); -int get_valid_checkpoint(struct f2fs_sb_info *sbi); -void update_dirty_page(struct inode *inode, struct page *page); -void 
remove_dirty_inode(struct inode *inode); -int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); -int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); -void init_ino_entry_info(struct f2fs_sb_info *sbi); -int __init create_checkpoint_caches(void); -void destroy_checkpoint_caches(void); +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_add_orphan_inode(struct inode *inode); +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); +void f2fs_update_dirty_page(struct inode *inode, struct page *page); +void f2fs_remove_dirty_inode(struct inode *inode); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_checkpoint_caches(void); +void f2fs_destroy_checkpoint_caches(void); /* * data.c @@ -3017,27 +3024,27 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); -void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); -int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); -int reserve_new_block(struct dnode_of_data *dn); +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -struct 
page *get_read_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); -struct page *find_data_page(struct inode *inode, pgoff_t index); -struct page *get_lock_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); -struct page *get_new_data_page(struct inode *inode, +struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); -int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_do_write_data_page(struct f2fs_io_info *fio); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); -bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3046,23 +3053,23 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); -void clear_radix_tree_dirty_tag(struct page *page); +void f2fs_clear_radix_tree_dirty_tag(struct page *page); /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *sbi); -void stop_gc_thread(struct f2fs_sb_info *sbi); -block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); 
+block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno); -void build_gc_manager(struct f2fs_sb_info *sbi); +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); -bool space_for_roll_forward(struct f2fs_sb_info *sbi); +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -3267,29 +3274,31 @@ extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; -extern struct kmem_cache *inode_entry_slab; +extern struct kmem_cache *f2fs_inode_entry_slab; /* * inline.c */ bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); -void read_inline_data(struct page *page, struct page *ipage); -void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +void f2fs_do_read_inline_data(struct page *page, struct page *ipage); +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_write_inline_data(struct inode *inode, struct page *page); -bool recover_inline_data(struct inode *inode, struct page *npage); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, +bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page); -int make_empty_inline_dir(struct inode 
*inode, struct inode *parent, +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage); int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, - struct inode *dir, struct inode *inode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct page *page, struct inode *dir, + struct inode *inode); bool f2fs_empty_inline_dir(struct inode *dir); int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr); @@ -3310,17 +3319,17 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs); -struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root *root, struct rb_node **parent, unsigned int ofs); -struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force); -bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); @@ -3332,9 +3341,9 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, block_t blkaddr, unsigned 
int len); -void init_extent_cache_info(struct f2fs_sb_info *sbi); -int __init create_extent_cache(void); -void destroy_extent_cache(void); +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_extent_cache(void); +void f2fs_destroy_extent_cache(void); /* * sysfs.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0c2af49be162..f1476c93ded5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -160,17 +160,18 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) cp_reason = CP_WRONG_PINO; - else if (!space_for_roll_forward(sbi)) + else if (!f2fs_space_for_roll_forward(sbi)) cp_reason = CP_NO_SPC_ROLL; - else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + else if (!f2fs_is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) cp_reason = CP_FASTBOOT_MODE; else if (F2FS_OPTION(sbi).active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && - need_dentry_mark(sbi, inode->i_ino) && - exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) + f2fs_need_dentry_mark(sbi, inode->i_ino) && + f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino, + TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; return cp_reason; @@ -181,7 +182,7 @@ static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) struct page *i = find_get_page(NODE_MAPPING(sbi), ino); bool ret = false; /* But we need to avoid that there are some inode updates */ - if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) + if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino)) ret = true; f2fs_put_page(i, 0); return ret; @@ -241,14 +242,14 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * if there is no written data, don't waste time to write recovery info. 
*/ if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && - !exist_written_data(sbi, ino, APPEND_INO)) { + !f2fs_exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || - exist_written_data(sbi, ino, UPDATE_INO)) + f2fs_exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } @@ -275,7 +276,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, goto out; } sync_nodes: - ret = fsync_node_pages(sbi, inode, &wbc, atomic); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); if (ret) goto out; @@ -285,7 +286,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, goto out; } - if (need_inode_block_update(sbi, ino)) { + if (f2fs_need_inode_block_update(sbi, ino)) { f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; @@ -300,21 +301,21 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * given fsync mark. 
*/ if (!atomic) { - ret = wait_on_node_pages_writeback(sbi, ino); + ret = f2fs_wait_on_node_pages_writeback(sbi, ino); if (ret) goto out; } /* once recovery info is written, don't need to tack this */ - remove_ino_entry(sbi, ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { - remove_ino_entry(sbi, ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - remove_ino_entry(sbi, ino, FLUSH_INO); + f2fs_remove_ino_entry(sbi, ino, FLUSH_INO); } f2fs_update_time(sbi, REQ_TIME); out: @@ -395,13 +396,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = get_next_page_offset(&dn, pgofs); + pgofs = f2fs_get_next_page_offset(&dn, pgofs); continue; } else { goto found; @@ -489,7 +490,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return dquot_file_open(inode, filp); } -void truncate_data_blocks_range(struct dnode_of_data *dn, int count) +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; @@ -509,8 +510,8 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) continue; dn->data_blkaddr = NULL_ADDR; - set_data_blkaddr(dn); - invalidate_blocks(sbi, blkaddr); + f2fs_set_data_blkaddr(dn); + f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, 
FI_FIRST_BLOCK_WRITTEN); nr_free++; @@ -522,7 +523,7 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) * once we invalidate valid blkaddr in range [ofs, ofs + count], * we will invalidate all blkaddr in the whole range. */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); @@ -534,9 +535,9 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->ofs_in_node, nr_free); } -void truncate_data_blocks(struct dnode_of_data *dn) +void f2fs_truncate_data_blocks(struct dnode_of_data *dn) { - truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); } static int truncate_partial_data_page(struct inode *inode, u64 from, @@ -558,7 +559,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } - page = get_lock_data_page(inode, index, true); + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 
0 : PTR_ERR(page); truncate_out: @@ -573,7 +574,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } -int truncate_blocks(struct inode *inode, u64 from, bool lock) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -592,21 +593,21 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(inode, ipage, from); + f2fs_truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); + err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; @@ -619,13 +620,13 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) f2fs_bug_on(sbi, count < 0); if (dn.ofs_in_node || IS_INODE(dn.node_page)) { - truncate_data_blocks_range(&dn, count); + f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } f2fs_put_dnode(&dn); free_next: - err = truncate_inode_blocks(inode, free_from); + err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) f2fs_unlock_op(sbi); @@ -664,7 +665,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = truncate_blocks(inode, i_size_read(inode), true); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -815,7 +816,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, get_inode_mode(inode)); + err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = 
F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); @@ -857,7 +858,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - page = get_new_data_page(inode, NULL, index, false); + page = f2fs_get_new_data_page(inode, NULL, index, false); f2fs_unlock_op(sbi); if (IS_ERR(page)) @@ -870,7 +871,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, return 0; } -int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) { int err; @@ -879,10 +880,11 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start = get_next_page_offset(&dn, pg_start); + pg_start = f2fs_get_next_page_offset(&dn, + pg_start); continue; } return err; @@ -893,7 +895,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); - truncate_data_blocks_range(&dn, count); + f2fs_truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); pg_start += count; @@ -949,7 +951,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end - 1); f2fs_lock_op(sbi); - ret = truncate_hole(inode, pg_start, pg_end); + ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); up_write(&F2FS_I(inode)->i_mmap_sem); } @@ -967,7 +969,7 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { @@ -984,7 +986,7 @@ static int __read_out_blkaddrs(struct inode 
*inode, block_t *blkaddr, for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (!is_checkpointed_data(sbi, *blkaddr)) { + if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { f2fs_put_dnode(&dn); @@ -1017,10 +1019,10 @@ static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, continue; set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); if (ret) { dec_valid_block_count(sbi, inode, 1); - invalidate_blocks(sbi, *blkaddr); + f2fs_invalidate_blocks(sbi, *blkaddr); } else { f2fs_update_data_blkaddr(&dn, *blkaddr); } @@ -1050,18 +1052,18 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, pgoff_t ilen; set_new_dnode(&dn, dst_inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + ret = f2fs_get_dnode_of_data(&dn, dst + i, ALLOC_NODE); if (ret) return ret; - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); ilen = min((pgoff_t) ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { dn.data_blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, @@ -1084,10 +1086,11 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } else { struct page *psrc, *pdst; - psrc = get_lock_data_page(src_inode, src + i, true); + psrc = f2fs_get_lock_data_page(src_inode, + src + i, true); if (IS_ERR(psrc)) return PTR_ERR(psrc); - pdst = get_new_data_page(dst_inode, NULL, dst + i, + pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i, true); if (IS_ERR(pdst)) { f2fs_put_page(psrc, 1); @@ -1098,7 +1101,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, f2fs_put_page(pdst, 
1); f2fs_put_page(psrc, 1); - ret = truncate_hole(src_inode, src + i, src + i + 1); + ret = f2fs_truncate_hole(src_inode, + src + i, src + i + 1); if (ret) return ret; i++; @@ -1215,7 +1219,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = truncate_blocks(inode, new_size, true); + ret = f2fs_truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); out_unlock: @@ -1240,7 +1244,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, } dn->ofs_in_node = ofs_in_node; - ret = reserve_new_blocks(dn, count); + ret = f2fs_reserve_new_blocks(dn, count); if (ret) return ret; @@ -1249,7 +1253,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); /* - * reserve_new_blocks will not guarantee entire block + * f2fs_reserve_new_blocks will not guarantee entire block * allocation. 
*/ if (dn->data_blkaddr == NULL_ADDR) { @@ -1257,9 +1261,9 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, break; } if (dn->data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn->data_blkaddr); + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); } } @@ -1325,7 +1329,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); + ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; @@ -1399,7 +1403,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = truncate_blocks(inode, i_size_read(inode), true); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (ret) goto out; @@ -1560,7 +1564,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); @@ -1583,7 +1587,7 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) */ if (f2fs_is_atomic_file(inode) && F2FS_I(inode)->inmem_task == current) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); return 0; } @@ -1732,7 +1736,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } if (f2fs_is_atomic_file(inode)) { - ret = commit_inmem_pages(inode); + ret = f2fs_commit_inmem_pages(inode); if (ret) goto err_out; @@ -1835,7 +1839,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) inode_lock(inode); if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); if 
(f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); @@ -1892,7 +1896,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -1900,10 +1904,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) goto out; } - stop_gc_thread(sbi); - stop_discard_thread(sbi); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); - drop_discard_cmd(sbi); + f2fs_drop_discard_cmd(sbi); clear_opt(sbi, DISCARD); f2fs_update_time(sbi, REQ_TIME); @@ -2096,7 +2100,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; } -static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_f2fs_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2134,7 +2138,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (should_update_inplace(inode, NULL)) + if (f2fs_should_update_inplace(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2229,7 +2233,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { struct page *page; - page = get_lock_data_page(inode, idx, true); + page = f2fs_get_lock_data_page(inode, idx, true); if (IS_ERR(page)) { err = PTR_ERR(page); goto clear_out; @@ -2576,7 +2580,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (should_update_outplace(inode, NULL)) { + if (f2fs_should_update_outplace(inode, NULL)) { ret = -EINVAL; goto out; } @@ -2689,7 +2693,7 @@ long f2fs_ioctl(struct 
file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_write_checkpoint(filp, arg); + return f2fs_ioc_f2fs_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 45713a64612d..dcadc0691a3e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -114,7 +114,7 @@ static int gc_thread_func(void *data) return 0; } -int start_gc_thread(struct f2fs_sb_info *sbi) +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -146,7 +146,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) return err; } -void stop_gc_thread(struct f2fs_sb_info *sbi) +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; if (!gc_th) @@ -429,7 +429,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) iput(inode); return; } - new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); new_ie->inode = inode; f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); @@ -443,7 +443,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list) radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); list_del(&ie->list); - kmem_cache_free(inode_entry_slab, ie); + kmem_cache_free(f2fs_inode_entry_slab, ie); } } @@ -492,34 +492,34 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, continue; if (phase == 0) { - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true); continue; } if (phase == 1) { - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); continue; } /* phase == 2 */ - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; - /* 
block may become invalid during get_node_page */ + /* block may become invalid during f2fs_get_node_page */ if (check_valid_map(sbi, segno, off) == 0) { f2fs_put_page(node_page, 1); continue; } - get_node_info(sbi, nid, &ni); + f2fs_get_node_info(sbi, nid, &ni); if (ni.blk_addr != start_addr + off) { f2fs_put_page(node_page, 1); continue; } - move_node_page(node_page, gc_type); + f2fs_move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } @@ -534,7 +534,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -565,11 +565,11 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) return false; - get_node_info(sbi, nid, dni); + f2fs_get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_msg(sbi->sb, KERN_WARNING, @@ -633,7 +633,7 @@ static void move_data_block(struct inode *inode, block_t bidx, } set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) goto out; @@ -648,7 +648,7 @@ static void move_data_block(struct inode *inode, block_t bidx, */ f2fs_wait_on_page_writeback(page, DATA, true); - get_node_info(fio.sbi, dn.nid, &ni); + f2fs_get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ @@ -658,7 +658,7 @@ static void move_data_block(struct inode *inode, block_t bidx, if (lfs_mode) down_write(&fio.sbi->io_order_lock); - allocate_data_block(fio.sbi, NULL, 
fio.old_blkaddr, &newaddr, + f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), @@ -717,7 +717,7 @@ static void move_data_block(struct inode *inode, block_t bidx, if (lfs_mode) up_write(&fio.sbi->io_order_lock); if (err) - __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, true, true); put_out: f2fs_put_dnode(&dn); @@ -730,7 +730,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, { struct page *page; - page = get_lock_data_page(inode, bidx, true); + page = f2fs_get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return; @@ -775,12 +775,12 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); } set_cold_data(page); - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err) { clear_cold_data(page); if (err == -ENOMEM) { @@ -832,13 +832,13 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; if (phase == 0) { - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true); continue; } if (phase == 1) { - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); continue; } @@ -847,7 +847,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; if (phase == 2) { - ra_node_page(sbi, dni.ino); + f2fs_ra_node_page(sbi, dni.ino); continue; } @@ -870,8 +870,8 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } - start_bidx = start_bidx_of_node(nofs, inode); - data_page = get_read_data_page(inode, + start_bidx = f2fs_start_bidx_of_node(nofs, inode); + data_page 
= f2fs_get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -905,7 +905,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, inode_dio_wait(inode); } - start_bidx = start_bidx_of_node(nofs, inode) + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_post_read_required(inode)) move_data_block(inode, start_bidx, gc_type, @@ -955,12 +955,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, META_SSA, true); /* reference all summary page */ while (segno < end_segno) { - sum_page = get_sum_page(sbi, segno++); + sum_page = f2fs_get_sum_page(sbi, segno++); unlock_page(sum_page); } @@ -1056,7 +1056,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, * secure free segments which doesn't need fggc any more. 
*/ if (prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); + ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; } @@ -1093,13 +1093,13 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (has_not_enough_free_secs(sbi, sec_freed, 0)) { if (skipped_round > MAX_SKIP_ATOMIC_COUNT && skipped_round * 2 >= round) - drop_inmem_pages_all(sbi, true); + f2fs_drop_inmem_pages_all(sbi, true); segno = NULL_SEGNO; goto gc_more; } if (gc_type == FG_GC) - ret = write_checkpoint(sbi, &cpc); + ret = f2fs_write_checkpoint(sbi, &cpc); } stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; @@ -1123,7 +1123,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, return ret; } -void build_gc_manager(struct f2fs_sb_info *sbi) +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { DIRTY_I(sbi)->v_ops = &default_v_ops; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b51cc241f354..f3185ae98860 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -42,7 +42,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) return true; } -void read_inline_data(struct page *page, struct page *ipage) +void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; @@ -64,7 +64,8 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from) { void *addr; @@ -85,7 +86,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { unlock_page(page); return PTR_ERR(ipage); @@ -99,7 +100,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) if (page->index) zero_user_segment(page, 0, PAGE_SIZE); else - read_inline_data(page, ipage); + 
f2fs_do_read_inline_data(page, ipage); if (!PageUptodate(page)) SetPageUptodate(page); @@ -131,7 +132,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - read_inline_data(page, dn->inode_page); + f2fs_do_read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -142,18 +143,18 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) ClearPageError(page); fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); - write_data_page(dn, &fio); + f2fs_outplace_write_data(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { inode_dec_dirty_pages(dn->inode); - remove_dirty_inode(dn->inode); + f2fs_remove_dirty_inode(dn->inode); } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode, dn->inode_page, 0); + f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); @@ -178,7 +179,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -207,7 +208,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) int err; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); if (err) return err; @@ -225,7 +226,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - clear_radix_tree_dirty_tag(page); + f2fs_clear_radix_tree_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -235,7 +236,7 @@ int f2fs_write_inline_data(struct inode 
*inode, struct page *page) return 0; } -bool recover_inline_data(struct inode *inode, struct page *npage) +bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; @@ -256,7 +257,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -274,20 +275,20 @@ bool recover_inline_data(struct inode *inode, struct page *npage) } if (f2fs_has_inline_data(inode)) { - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_inode(inode, ipage, 0); + f2fs_truncate_inline_inode(inode, ipage, 0); clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (truncate_blocks(inode, 0, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } return false; } -struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); @@ -298,7 +299,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, void *inline_dentry; f2fs_hash_t namehash; - ipage = get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { *res_page = ipage; return NULL; @@ -309,7 +310,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, inline_dentry = inline_data_addr(dir, ipage); make_dentry_ptr_inline(dir, &d, inline_dentry); - de = find_target_dentry(fname, namehash, NULL, &d); + de = f2fs_find_target_dentry(fname, namehash, NULL, &d); 
unlock_page(ipage); if (de) *res_page = ipage; @@ -319,7 +320,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return de; } -int make_empty_inline_dir(struct inode *inode, struct inode *parent, +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { struct f2fs_dentry_ptr d; @@ -328,7 +329,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); - do_make_empty_dir(inode, parent, &d); + f2fs_do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -386,7 +387,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -429,7 +430,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); - fake_mode = get_de_type(de) << S_SHIFT; + fake_mode = f2fs_get_de_type(de) << S_SHIFT; err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, ino, fake_mode); @@ -441,8 +442,8 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - truncate_blocks(dir, 0, false); - remove_dirty_inode(dir); + f2fs_truncate_blocks(dir, 0, false); + f2fs_remove_dirty_inode(dir); return err; } @@ -460,7 +461,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); - truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -509,14 +510,14 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *page = NULL; int err = 0; - ipage = 
get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); inline_dentry = inline_data_addr(dir, ipage); make_dentry_ptr_inline(dir, &d, inline_dentry); - bit_pos = room_for_filename(d.bitmap, slots, d.max); + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) @@ -527,7 +528,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, + page = f2fs_init_inode_metadata(inode, dir, new_name, orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -548,7 +549,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_put_page(page, 1); } - update_parent_metadata(dir, inode, 0); + f2fs_update_parent_metadata(dir, inode, 0); fail: if (inode) up_write(&F2FS_I(inode)->i_sem); @@ -594,7 +595,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; @@ -625,7 +626,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -651,7 +652,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, struct page *ipage; int err = 0; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -667,7 +668,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = 
(__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2f8c99ab99f7..27e18b5cb459 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -209,10 +209,10 @@ static int do_read_inode(struct inode *inode) projid_t i_projid; /* Check if ino is within scope */ - if (check_nid_range(sbi, inode->i_ino)) + if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - node_page = get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -278,7 +278,7 @@ static int do_read_inode(struct inode *inode) if (__written_first_block(ri)) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); - if (!need_inode_block_update(sbi, inode->i_ino)) + if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; if (fi->i_flags & F2FS_PROJINHERIT_FL) @@ -390,7 +390,7 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) return inode; } -void update_inode(struct inode *inode, struct page *node_page) +void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; @@ -476,12 +476,12 @@ void update_inode(struct inode *inode, struct page *node_page) F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; } -void update_inode_page(struct inode *inode) +void f2fs_update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; retry: - node_page = get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); if (err == -ENOMEM) { @@ -492,7 +492,7 @@ void update_inode_page(struct inode *inode) } return; } - update_inode(inode, node_page); + f2fs_update_inode(inode, node_page); f2fs_put_page(node_page, 1); } @@ -511,7 +511,7 @@ int 
f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - update_inode_page(inode); + f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; @@ -528,7 +528,7 @@ void f2fs_evict_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -538,7 +538,7 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); @@ -547,9 +547,9 @@ void f2fs_evict_inode(struct inode *inode) dquot_initialize(inode); - remove_ino_entry(sbi, inode->i_ino, APPEND_INO); - remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); - remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); @@ -566,7 +566,7 @@ void f2fs_evict_inode(struct inode *inode) #endif if (!err) { f2fs_lock_op(sbi); - err = remove_inode_page(inode); + err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); if (err == -ENOENT) err = 0; @@ -579,7 +579,7 @@ void f2fs_evict_inode(struct inode *inode) } if (err) - update_inode_page(inode); + f2fs_update_inode_page(inode); dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: @@ -602,18 +602,18 @@ void f2fs_evict_inode(struct inode *inode) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); + 
f2fs_add_ino_entry(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_add_ino_entry(sbi, inode->i_ino, UPDATE_INO); } if (is_inode_flag_set(inode, FI_FREE_NID)) { - alloc_nid_failed(sbi, inode->i_ino); + f2fs_alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); } else { /* * If xattr nid is corrupted, we can reach out error condition, - * err & !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). - * In that case, check_nid_range() is enough to give a clue. + * err & !f2fs_exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, f2fs_check_nid_range() is enough to give a clue. */ } out_clear: @@ -622,7 +622,7 @@ void f2fs_evict_inode(struct inode *inode) } /* caller should call f2fs_lock_op() */ -void handle_failed_inode(struct inode *inode) +void f2fs_handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; @@ -637,7 +637,7 @@ void handle_failed_inode(struct inode *inode) * we must call this to avoid inode being remained as dirty, resulting * in a panic when flushing dirty inodes in gdirty_list. */ - update_inode_page(inode); + f2fs_update_inode_page(inode); f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ @@ -648,18 +648,18 @@ void handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. 
*/ - get_node_info(sbi, inode->i_ino, &ni); + f2fs_get_node_info(sbi, inode->i_ino, &ni); if (ni.blk_addr != NULL_ADDR) { - int err = acquire_orphan_inode(sbi); + int err = f2fs_acquire_orphan_inode(sbi); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, "Too many orphan inodes, run fsck to fix."); } else { - add_orphan_inode(inode); + f2fs_add_orphan_inode(inode); } - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); } else { set_inode_flag(inode, FI_FREE_NID); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b32433d8667b..7b025524ee16 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,7 +37,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(-ENOMEM); f2fs_lock_op(sbi); - if (!alloc_nid(sbi, &ino)) { + if (!f2fs_alloc_nid(sbi, &ino)) { f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; @@ -196,7 +196,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * up_read(&sbi->sb_lock); } -int update_extension_list(struct f2fs_sb_info *sbi, const char *name, +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; @@ -295,7 +295,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto out; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, ino); + f2fs_alloc_nid_done(sbi, ino); unlock_new_inode(inode); d_instantiate(dentry, inode); @@ -306,7 +306,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, f2fs_balance_fs(sbi, true); return 0; out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -401,7 +401,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) err = PTR_ERR(page); goto out; } else { - err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + err = f2fs_do_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) goto 
out; } @@ -412,7 +412,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) else if (IS_ERR(page)) err = PTR_ERR(page); else - err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR); out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -524,7 +524,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); f2fs_put_page(page, 0); @@ -586,9 +586,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) - goto out_handle_failed_inode; + goto out_f2fs_handle_failed_inode; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) @@ -622,8 +622,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_balance_fs(sbi, true); goto out_free_encrypted_link; -out_handle_failed_inode: - handle_failed_inode(inode); +out_f2fs_handle_failed_inode: + f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); @@ -659,7 +659,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); unlock_new_inode(inode); d_instantiate(dentry, inode); @@ -672,7 +672,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: clear_inode_flag(inode, FI_INC_LINK); - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -711,7 +711,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, goto out; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + 
f2fs_alloc_nid_done(sbi, inode->i_ino); unlock_new_inode(inode); d_instantiate(dentry, inode); @@ -722,7 +722,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, f2fs_balance_fs(sbi, true); return 0; out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -751,7 +751,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, } f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) goto out; @@ -763,8 +763,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(inode); - alloc_nid_done(sbi, inode->i_ino); + f2fs_add_orphan_inode(inode); + f2fs_alloc_nid_done(sbi, inode->i_ino); if (whiteout) { f2fs_i_links_write(inode, false); @@ -780,9 +780,9 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, return 0; release_out: - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -889,7 +889,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) goto put_out_dir; @@ -903,9 +903,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(new_inode)->i_sem); if (!new_inode->i_nlink) - add_orphan_inode(new_inode); + f2fs_add_orphan_inode(new_inode); else - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); } else { f2fs_balance_fs(sbi, true); @@ -974,9 +974,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, false); } if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, 
TRANS_DIR_INO); if (S_ISDIR(old_inode->i_mode)) - add_ino_entry(sbi, old_inode->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, old_inode->i_ino, + TRANS_DIR_INO); } f2fs_unlock_op(sbi); @@ -1128,8 +1129,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_mark_inode_dirty_sync(new_dir, false); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { - add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); } f2fs_unlock_op(sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2902e1fadebc..cd0f60b5be7a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -23,7 +23,7 @@ #include "trace.h" #include -#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) +#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -32,7 +32,7 @@ static struct kmem_cache *nat_entry_set_slab; /* * Check whether the given nid is within node id range. 
*/ -int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -44,7 +44,7 @@ int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -bool available_free_memory(struct f2fs_sb_info *sbi, int type) +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct sysinfo val; @@ -103,7 +103,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { if (PageDirty(page)) { - clear_radix_tree_dirty_tag(page); + f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } @@ -113,7 +113,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { pgoff_t index = current_nat_addr(sbi, nid); - return get_meta_page(sbi, index); + return f2fs_get_meta_page(sbi, index); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -130,8 +130,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) dst_off = next_nat_addr(sbi, src_off); /* get current nat block page with lock */ - src_page = get_meta_page(sbi, src_off); - dst_page = grab_meta_page(sbi, dst_off); + src_page = f2fs_get_meta_page(sbi, src_off); + dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); src_addr = page_address(src_page); @@ -267,7 +267,7 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -284,7 +284,7 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) return need; } 
-bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -298,7 +298,7 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } -bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -397,7 +397,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, up_write(&nm_i->nat_tree_lock); } -int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; @@ -419,7 +419,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) /* * This function always returns success */ -void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) +void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -449,7 +450,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) /* Check current segment summary */ down_read(&curseg->journal_rwsem); - i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); + i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); @@ -464,7 +465,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) index = current_nat_addr(sbi, nid); up_read(&nm_i->nat_tree_lock); - page = get_meta_page(sbi, index); + page = f2fs_get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); @@ -477,7 +478,7 @@ void 
get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) /* * readahead MAX_RA_NODE number of node pages. */ -static void ra_node_pages(struct page *parent, int start, int n) +static void f2fs_ra_node_pages(struct page *parent, int start, int n) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); struct blk_plug plug; @@ -491,13 +492,13 @@ static void ra_node_pages(struct page *parent, int start, int n) end = min(end, NIDS_PER_BLOCK); for (i = start; i < end; i++) { nid = get_nid(parent, i, false); - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); } blk_finish_plug(&plug); } -pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) { const long direct_index = ADDRS_PER_INODE(dn->inode); const long direct_blks = ADDRS_PER_BLOCK; @@ -612,7 +613,7 @@ static int get_node_path(struct inode *inode, long block, * f2fs_unlock_op() only if ro is not set RDONLY_NODE. * In the case of RDONLY_NODE, we don't need to care about mutex. 
*/ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *npage[4]; @@ -631,7 +632,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) npage[0] = dn->inode_page; if (!npage[0]) { - npage[0] = get_node_page(sbi, nids[0]); + npage[0] = f2fs_get_node_page(sbi, nids[0]); if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } @@ -655,24 +656,24 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ - if (!alloc_nid(sbi, &(nids[i]))) { + if (!f2fs_alloc_nid(sbi, &(nids[i]))) { err = -ENOSPC; goto release_pages; } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i]); + npage[i] = f2fs_new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { - alloc_nid_failed(sbi, nids[i]); + f2fs_alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); - alloc_nid_done(sbi, nids[i]); + f2fs_alloc_nid_done(sbi, nids[i]); done = true; } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { - npage[i] = get_node_page_ra(parent, offset[i - 1]); + npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); goto release_pages; @@ -687,7 +688,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (!done) { - npage[i] = get_node_page(sbi, nids[i]); + npage[i] = f2fs_get_node_page(sbi, nids[i]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); f2fs_put_page(npage[0], 0); @@ -726,15 +727,15 @@ static void truncate_node(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); /* Deallocate node address */ - invalidate_blocks(sbi, ni.blk_addr); + 
f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { - remove_orphan_inode(sbi, dn->nid); + f2fs_remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } @@ -759,7 +760,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) @@ -768,7 +769,7 @@ static int truncate_dnode(struct dnode_of_data *dn) /* Make dnode_of_data for parameter */ dn->node_page = page; dn->ofs_in_node = 0; - truncate_data_blocks(dn); + f2fs_truncate_data_blocks(dn); truncate_node(dn); return 1; } @@ -789,13 +790,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); } - ra_node_pages(page, ofs, NIDS_PER_BLOCK); + f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK); rn = F2FS_NODE(page); if (depth < 3) { @@ -865,7 +866,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]); + pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]); if (IS_ERR(pages[i])) { err = PTR_ERR(pages[i]); idx = i - 1; @@ -874,7 +875,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } - ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + f2fs_ra_node_pages(pages[idx], offset[idx + 
1], NIDS_PER_BLOCK); /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { @@ -911,7 +912,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* * All the block addresses of data and nodes should be nullified. */ -int truncate_inode_blocks(struct inode *inode, pgoff_t from) +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err = 0, cont = 1; @@ -927,7 +928,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) if (level < 0) return level; - page = get_node_page(sbi, inode->i_ino); + page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); return PTR_ERR(page); @@ -1007,7 +1008,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) } /* caller must lock inode page */ -int truncate_xattr_node(struct inode *inode) +int f2fs_truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -1017,7 +1018,7 @@ int truncate_xattr_node(struct inode *inode) if (!nid) return 0; - npage = get_node_page(sbi, nid); + npage = f2fs_get_node_page(sbi, nid); if (IS_ERR(npage)) return PTR_ERR(npage); @@ -1032,17 +1033,17 @@ int truncate_xattr_node(struct inode *inode) * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). 
*/ -int remove_inode_page(struct inode *inode) +int f2fs_remove_inode_page(struct inode *inode) { struct dnode_of_data dn; int err; set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); if (err) return err; - err = truncate_xattr_node(inode); + err = f2fs_truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; @@ -1051,7 +1052,7 @@ int remove_inode_page(struct inode *inode) /* remove potential inline_data blocks */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), @@ -1062,7 +1063,7 @@ int remove_inode_page(struct inode *inode) return 0; } -struct page *new_inode_page(struct inode *inode) +struct page *f2fs_new_inode_page(struct inode *inode) { struct dnode_of_data dn; @@ -1070,10 +1071,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0); + return f2fs_new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; @@ -1091,7 +1092,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - get_node_info(sbi, dn->nid, &new_ni); + f2fs_get_node_info(sbi, dn->nid, &new_ni); f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); #endif new_ni.nid = dn->nid; @@ -1143,7 +1144,7 @@ static int read_node_page(struct page *page, int op_flags) if (PageUptodate(page)) return LOCKED_PAGE; - get_node_info(sbi, page->index, &ni); + f2fs_get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) 
{ ClearPageUptodate(page); @@ -1157,14 +1158,14 @@ static int read_node_page(struct page *page, int op_flags) /* * Readahead a node page */ -void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { struct page *apage; int err; if (!nid) return; - if (check_nid_range(sbi, nid)) + if (f2fs_check_nid_range(sbi, nid)) return; rcu_read_lock(); @@ -1189,7 +1190,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (!nid) return ERR_PTR(-ENOENT); - if (check_nid_range(sbi, nid)) + if (f2fs_check_nid_range(sbi, nid)) return ERR_PTR(-EINVAL); repeat: page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); @@ -1206,7 +1207,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, } if (parent) - ra_node_pages(parent, start + 1, MAX_RA_NODE); + f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); @@ -1240,12 +1241,12 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, return page; } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { return __get_node_page(sbi, nid, NULL, 0); } -struct page *get_node_page_ra(struct page *parent, int start) +struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); @@ -1280,7 +1281,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: @@ -1384,7 +1385,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, down_read(&sbi->node_write); } - get_node_info(sbi, nid, &ni); + f2fs_get_node_info(sbi, nid, &ni); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ 
-1401,7 +1402,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, set_page_writeback(page); ClearPageError(page); fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); + f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); @@ -1430,7 +1431,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, return AOP_WRITEPAGE_ACTIVATE; } -void move_node_page(struct page *node_page, int gc_type) +void f2fs_move_node_page(struct page *node_page, int gc_type) { if (gc_type == FG_GC) { struct writeback_control wbc = { @@ -1467,7 +1468,7 @@ static int f2fs_write_node_page(struct page *page, return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } -int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index; @@ -1534,9 +1535,9 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - update_inode(inode, page); + f2fs_update_inode(inode, page); set_dentry_mark(page, - need_dentry_mark(sbi, ino)); + f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ if (!PageDirty(page)) @@ -1586,7 +1587,8 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { pgoff_t index; @@ -1687,7 +1689,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, return ret; } -int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index = 0; struct pagevec pvec; @@ -1744,7 +1746,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc, true, FS_NODE_IO); + f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1892,20 +1894,20 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, * Thread A Thread B * - f2fs_create * - f2fs_new_inode - * - alloc_nid + * - f2fs_alloc_nid * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg - * - build_free_nids - * - __build_free_nids + * - f2fs_build_free_nids + * - __f2fs_build_free_nids * - scan_nat_page * - add_free_nid * - __lookup_nat_cache * - f2fs_add_link - * - init_inode_metadata - * - new_inode_page - * - new_node_page + * - f2fs_init_inode_metadata + * - f2fs_new_inode_page + * - f2fs_new_node_page * - set_node_addr - * - alloc_nid_done + * - f2fs_alloc_nid_done * - __remove_nid_from_list(PREALLOC_NID) * - __insert_nid_to_list(FREE_NID) */ @@ -2037,7 +2039,8 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) up_read(&nm_i->nat_tree_lock); } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, + bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); int i = 0; @@ -2050,7 +2053,7 @@ static void __build_free_nids(struct 
f2fs_sb_info *sbi, bool sync, bool mount) if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; - if (!sync && !available_free_memory(sbi, FREE_NIDS)) + if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) return; if (!mount) { @@ -2062,7 +2065,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) } /* readahead nat pages to be scanned */ - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); down_read(&nm_i->nat_tree_lock); @@ -2092,14 +2095,14 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) up_read(&nm_i->nat_tree_lock); - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync, mount); + __f2fs_build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -2108,7 +2111,7 @@ void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) * from second parameter of this function. * The returned nid could be used ino as well as nid when inode is created. 
*/ -bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; @@ -2126,8 +2129,8 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) return false; } - /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + /* We should not use stale free nids created by f2fs_build_free_nids */ + if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); @@ -2144,14 +2147,14 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true, false); + f2fs_build_free_nids(sbi, true, false); goto retry; } /* - * alloc_nid() should be called prior to this function. + * f2fs_alloc_nid() should be called prior to this function. */ -void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -2166,9 +2169,9 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) } /* - * alloc_nid() should be called prior to this function. + * f2fs_alloc_nid() should be called prior to this function. 
*/ -void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -2181,7 +2184,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - if (!available_free_memory(sbi, FREE_NIDS)) { + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) { __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { @@ -2198,7 +2201,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next; @@ -2226,14 +2229,14 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) return nr - nr_shrink; } -void recover_inline_xattr(struct inode *inode, struct page *page) +void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; size_t inline_size; struct page *ipage; struct f2fs_inode *ri; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); @@ -2251,11 +2254,11 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: - update_inode(inode, ipage); + f2fs_update_inode(inode, ipage); f2fs_put_page(ipage, 1); } -int recover_xattr_data(struct inode *inode, struct page *page) +int f2fs_recover_xattr_data(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; @@ -2268,25 +2271,25 @@ int recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr 
nid */ - get_node_info(sbi, prev_xnid, &ni); - invalidate_blocks(sbi, ni.blk_addr); + f2fs_get_node_info(sbi, prev_xnid, &ni); + f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: update xattr nid in inode */ - if (!alloc_nid(sbi, &new_xnid)) + if (!f2fs_alloc_nid(sbi, &new_xnid)) return -ENOSPC; set_new_dnode(&dn, inode, NULL, NULL, new_xnid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { - alloc_nid_failed(sbi, new_xnid); + f2fs_alloc_nid_failed(sbi, new_xnid); return PTR_ERR(xpage); } - alloc_nid_done(sbi, new_xnid); - update_inode_page(inode); + f2fs_alloc_nid_done(sbi, new_xnid); + f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); @@ -2297,14 +2300,14 @@ int recover_xattr_data(struct inode *inode, struct page *page) return 0; } -int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - get_node_info(sbi, ino, &old_ni); + f2fs_get_node_info(sbi, ino, &old_ni); if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; @@ -2358,7 +2361,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) return 0; } -void restore_node_summary(struct f2fs_sb_info *sbi, +void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2375,10 +2378,10 @@ void restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ - ra_meta_pages(sbi, addr, nrpages, META_POR, true); + f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; 
idx++) { - struct page *page = get_tmp_page(sbi, idx); + struct page *page = f2fs_get_tmp_page(sbi, idx); rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; @@ -2520,7 +2523,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { - offset = lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); raw_ne = &nat_in_journal(journal, offset); @@ -2557,7 +2560,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2620,7 +2623,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { - struct page *page = get_meta_page(sbi, nat_bits_addr++); + struct page *page = f2fs_get_meta_page(sbi, nat_bits_addr++); memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); @@ -2763,7 +2766,7 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) return 0; } -int build_node_manager(struct f2fs_sb_info *sbi) +int f2fs_build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2783,11 +2786,11 @@ int build_node_manager(struct f2fs_sb_info *sbi) /* load free nid status from nat_bits table */ load_free_nid_bitmap(sbi); - build_free_nids(sbi, true, true); + f2fs_build_free_nids(sbi, true, true); return 0; } -void destroy_node_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next_i; @@ -2859,7 +2862,7 @@ void destroy_node_manager(struct 
f2fs_sb_info *sbi) kfree(nm_i); } -int __init create_node_manager_caches(void) +int __init f2fs_create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry)); @@ -2885,7 +2888,7 @@ int __init create_node_manager_caches(void) return -ENOMEM; } -void destroy_node_manager_caches(void) +void f2fs_destroy_node_manager_caches(void) { kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3c3551811134..daf81d416b89 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -47,7 +47,7 @@ static struct kmem_cache *fsync_entry_slab; -bool space_for_roll_forward(struct f2fs_sb_info *sbi) +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); @@ -162,7 +162,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, goto out_put; } - err = acquire_orphan_inode(F2FS_I_SB(inode)); + err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); goto out_put; @@ -173,7 +173,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_do_add_link(dir, &fname, inode, + err = f2fs_add_dentry(dir, &fname, inode, inode->i_ino, inode->i_mode); } if (err == -ENOMEM) @@ -252,10 +252,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = get_tmp_page(sbi, blkaddr); + page = f2fs_get_tmp_page(sbi, blkaddr); if (!is_recoverable_dnode(page)) break; @@ -269,7 +269,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { - err = recover_inode_page(sbi, page); + err = 
f2fs_recover_inode_page(sbi, page); if (err) break; quota_inode = true; @@ -310,7 +310,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr); } f2fs_put_page(page, 1); return err; @@ -353,7 +353,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } } - sum_page = get_sum_page(sbi, segno); + sum_page = f2fs_get_sum_page(sbi, segno); sum_node = (struct f2fs_summary_block *)page_address(sum_page); sum = sum_node->entries[blkoff]; f2fs_put_page(sum_page, 1); @@ -373,7 +373,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } /* Get the node page */ - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -398,7 +398,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, inode = dn->inode; } - bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); + bidx = f2fs_start_bidx_of_node(offset, inode) + + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its reference @@ -408,11 +409,11 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, unlock_page(dn->inode_page); set_new_dnode(&tdn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) goto out; if (tdn.data_blkaddr == blkaddr) - truncate_data_blocks_range(&tdn, 1); + f2fs_truncate_data_blocks_range(&tdn, 1); f2fs_put_dnode(&tdn); out: @@ -425,7 +426,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, truncate_out: if (datablock_addr(tdn.inode, tdn.node_page, tdn.ofs_in_node) == blkaddr) - truncate_data_blocks_range(&tdn, 1); + f2fs_truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); return 0; @@ -441,25 +442,25 @@ 
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* step 1: recover xattr */ if (IS_INODE(page)) { - recover_inline_xattr(inode, page); + f2fs_recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - err = recover_xattr_data(inode, page); + err = f2fs_recover_xattr_data(inode, page); if (!err) recovered++; goto out; } /* step 2: recover inline data */ - if (recover_inline_data(inode, page)) + if (f2fs_recover_inline_data(inode, page)) goto out; /* step 3: recover data indices */ - start = start_bidx_of_node(ofs_of_node(page), inode); + start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: - err = get_dnode_of_data(&dn, start, ALLOC_NODE); + err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -470,7 +471,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); @@ -486,7 +487,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* dest is invalid, just invalidate src block */ if (dest == NULL_ADDR) { - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); continue; } @@ -500,19 +501,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, * and then reserve one new block in dnode page. 
*/ if (dest == NEW_ADDR) { - truncate_data_blocks_range(&dn, 1); - reserve_new_block(&dn); + f2fs_truncate_data_blocks_range(&dn, 1); + f2fs_reserve_new_block(&dn); continue; } /* dest is valid block, try to recover from src to dest */ - if (is_valid_meta_blkaddr(sbi, dest, META_POR)) { + if (f2fs_is_valid_meta_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { - err = reserve_new_block(&dn); + err = f2fs_reserve_new_block(&dn); #ifdef CONFIG_F2FS_FAULT_INJECTION while (err) - err = reserve_new_block(&dn); + err = f2fs_reserve_new_block(&dn); #endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); @@ -567,12 +568,12 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) break; - ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr); - page = get_tmp_page(sbi, blkaddr); + page = f2fs_get_tmp_page(sbi, blkaddr); if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); @@ -610,11 +611,11 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, f2fs_put_page(page, 1); } if (!err) - allocate_new_segments(sbi); + f2fs_allocate_new_segments(sbi); return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct list_head inode_list; struct list_head dir_list; @@ -689,7 +690,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct cp_control cpc = { .reason = CP_RECOVERY, }; - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); } kmem_cache_destroy(fsync_entry_slab); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 507f697178b6..8672bf574426 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -169,7 +169,7 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long 
*addr, return result - size + __reverse_ffz(tmp); } -bool need_SSR(struct f2fs_sb_info *sbi) +bool f2fs_need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); @@ -184,7 +184,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void register_inmem_page(struct inode *inode, struct page *page) +void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -239,7 +239,8 @@ static int __revoke_inmem_pages(struct inode *inode, trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, page->index, + LOOKUP_NODE); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -249,9 +250,9 @@ static int __revoke_inmem_pages(struct inode *inode, err = -EAGAIN; goto next; } - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); if (cur->old_addr == NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); } else f2fs_replace_block(sbi, &dn, dn.data_blkaddr, @@ -273,7 +274,7 @@ static int __revoke_inmem_pages(struct inode *inode, return err; } -void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) { struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; struct inode *inode; @@ -296,7 +297,7 @@ void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) } drop: set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); iput(inode); } skip: @@ -305,7 +306,7 @@ void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) goto 
next; } -void drop_inmem_pages(struct inode *inode) +void f2fs_drop_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -323,7 +324,7 @@ void drop_inmem_pages(struct inode *inode) stat_dec_atomic_write(inode); } -void drop_inmem_page(struct inode *inode, struct page *page) +void f2fs_drop_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -353,7 +354,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __commit_inmem_pages(struct inode *inode) +static int __f2fs_commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -383,14 +384,14 @@ static int __commit_inmem_pages(struct inode *inode) f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); } retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -431,7 +432,7 @@ static int __commit_inmem_pages(struct inode *inode) return err; } -int commit_inmem_pages(struct inode *inode) +int f2fs_commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -443,7 +444,7 @@ int commit_inmem_pages(struct inode *inode) set_inode_flag(inode, FI_ATOMIC_COMMIT); mutex_lock(&fi->inmem_lock); - err = __commit_inmem_pages(inode); + err = __f2fs_commit_inmem_pages(inode); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) @@ -490,24 +491,24 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) return; /* 
try to shrink extent cache when there is no enough memory */ - if (!available_free_memory(sbi, EXTENT_CACHE)) + if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ - if (!available_free_memory(sbi, NAT_ENTRIES)) - try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) + f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); - if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, MAX_FREE_NIDS); + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) + f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false, false); + f2fs_build_free_nids(sbi, false, false); if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ - if (!available_free_memory(sbi, NAT_ENTRIES) || - !available_free_memory(sbi, INO_ENTRIES) || + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) || + !f2fs_available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || f2fs_time_over(sbi, CP_TIME)) { @@ -515,7 +516,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) struct blk_plug plug; blk_start_plug(&plug); - sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE); blk_finish_plug(&plug); } f2fs_sync_fs(sbi->sb, true); @@ -548,7 +549,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { - if (!is_dirty_device(sbi, ino, i, FLUSH_INO)) + if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) @@ -659,7 +660,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) return cmd.ret; } -int create_flush_cmd_control(struct f2fs_sb_info *sbi) +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct 
flush_cmd_control *fcc; @@ -696,7 +697,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; @@ -1102,7 +1103,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, goto do_insert; } - p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); + p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); do_insert: dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); if (!dc) @@ -1167,7 +1168,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, lstart, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, @@ -1278,7 +1279,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + f2fs_bug_on(sbi, + !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1331,7 +1333,7 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } -void drop_discard_cmd(struct f2fs_sb_info *sbi) +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) { __drop_discard_cmd(sbi); } @@ -1422,7 +1424,8 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) bool need_wait = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root, + NULL, blkaddr); if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); @@ -1437,7 +1440,7 
@@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) __wait_one_discard_bio(sbi, dc); } -void stop_discard_thread(struct f2fs_sb_info *sbi) +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1685,7 +1688,7 @@ static void release_discard_addr(struct discard_entry *entry) kmem_cache_free(discard_entry_slab, entry); } -void release_discard_addrs(struct f2fs_sb_info *sbi) +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; @@ -1696,7 +1699,7 @@ void release_discard_addrs(struct f2fs_sb_info *sbi) } /* - * Should call clear_prefree_segments after checkpoint is done. + * Should call f2fs_clear_prefree_segments after checkpoint is done. */ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { @@ -1709,7 +1712,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); } -void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *head = &dcc->entry_list; @@ -1851,7 +1855,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return; - stop_discard_thread(sbi); + f2fs_stop_discard_thread(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; @@ -1967,7 +1971,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); @@ -1987,7 +1991,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) up_write(&sit_i->sentry_lock); } 
-bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, offset; @@ -2026,7 +2030,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, /* * Calculate the number of current summary pages for writing */ -int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; @@ -2056,14 +2060,15 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) /* * Caller should put this summary page */ -struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); + return f2fs_get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); } -void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) { - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); @@ -2073,14 +2078,14 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) static void write_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_blk, block_t blk_addr) { - update_meta_page(sbi, (void *)sum_blk, blk_addr); + f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); } static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; @@ -2325,7 +2330,7 @@ static void 
change_curseg(struct f2fs_sb_info *sbi, int type) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - sum_page = get_sum_page(sbi, new_segno); + sum_page = f2fs_get_sum_page(sbi, new_segno); sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); @@ -2339,7 +2344,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) int i, cnt; bool reversed = false; - /* need_SSR() already forces to do this */ + /* f2fs_need_SSR() already forces to do this */ if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { curseg->next_segno = segno; return 1; @@ -2391,7 +2396,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); - else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) + else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2399,7 +2404,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -void allocate_new_segments(struct f2fs_sb_info *sbi) +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { struct curseg_info *curseg; unsigned int old_segno; @@ -2421,7 +2426,8 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; bool has_candidate = false; @@ -2454,9 +2460,9 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, issued = 0; mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); - dc = (struct discard_cmd 
*)__lookup_rb_tree_ret(&dcc->root, + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, @@ -2537,7 +2543,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) goto out; mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); if (err) goto out; @@ -2571,7 +2577,7 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -int rw_hint_to_seg_type(enum rw_hint hint) +int f2fs_rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { case WRITE_LIFE_SHORT: @@ -2644,7 +2650,7 @@ int rw_hint_to_seg_type(enum rw_hint hint) * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ -enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { @@ -2715,7 +2721,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) is_inode_flag_set(inode, FI_ATOMIC_FILE) || is_inode_flag_set(inode, FI_VOLATILE_FILE)) return CURSEG_HOT_DATA; - /* rw_hint_to_seg_type(inode->i_write_hint); */ + /* f2fs_rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; } else { if (IS_DNODE(fio->page)) @@ -2752,7 +2758,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) return type; } -void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list) @@ -2835,7 +2841,7 @@ static void update_device_state(struct f2fs_io_info *fio) devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); /* update device state for fsync */ - set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); 
/* update device state for checkpoint */ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { @@ -2853,7 +2859,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (keep_order) down_read(&fio->sbi->io_order_lock); reallocate: - allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ @@ -2869,7 +2875,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) up_read(&fio->sbi->io_order_lock); } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type) { struct f2fs_io_info fio = { @@ -2895,7 +2901,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } -void write_node_page(unsigned int nid, struct f2fs_io_info *fio) +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; @@ -2905,14 +2911,15 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } -void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); @@ -2920,7 +2927,7 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -int rewrite_data_page(struct f2fs_io_info *fio) +int f2fs_inplace_write_data(struct 
f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; @@ -2955,7 +2962,7 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, return i; } -void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) { @@ -3040,7 +3047,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg, recover_newaddr); f2fs_update_data_blkaddr(dn, new_addr); @@ -3086,7 +3093,7 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) start = start_sum_block(sbi); - page = get_meta_page(sbi, start++); + page = f2fs_get_meta_page(sbi, start++); kaddr = (unsigned char *)page_address(page); /* Step 1: restore nat cache */ @@ -3126,7 +3133,7 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); page = NULL; - page = get_meta_page(sbi, start++); + page = f2fs_get_meta_page(sbi, start++); kaddr = (unsigned char *)page_address(page); offset = 0; } @@ -3165,7 +3172,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_addr = GET_SUM_BLOCK(sbi, segno); } - new = get_meta_page(sbi, blk_addr); + new = f2fs_get_meta_page(sbi, blk_addr); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { @@ -3177,7 +3184,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - restore_node_summary(sbi, segno, sum); + f2fs_restore_node_summary(sbi, segno, sum); } } @@ -3209,10 +3216,10 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int err; if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { - int npages = npages_for_summary_flush(sbi, true); + int npages = 
f2fs_npages_for_summary_flush(sbi, true); if (npages >= 2) - ra_meta_pages(sbi, start_sum_block(sbi), npages, + f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, META_CP, true); /* restore for compacted data summary */ @@ -3221,7 +3228,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } if (__exist_node_summaries(sbi)) - ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), + f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), NR_CURSEG_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { @@ -3247,7 +3254,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) int written_size = 0; int i, j; - page = grab_meta_page(sbi, blkaddr++); + page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); @@ -3272,7 +3279,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) for (j = 0; j < blkoff; j++) { if (!page) { - page = grab_meta_page(sbi, blkaddr++); + page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); written_size = 0; @@ -3309,7 +3316,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, write_current_sum_page(sbi, i, blkaddr + (i - type)); } -void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); @@ -3317,12 +3324,12 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } -void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +int 
f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; @@ -3347,7 +3354,7 @@ int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return get_meta_page(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ -3360,7 +3367,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - page = grab_meta_page(sbi, dst_off); + page = f2fs_grab_meta_page(sbi, dst_off); seg_info_to_sit_page(sbi, page, start); set_page_dirty(page); @@ -3456,7 +3463,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi) * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ -void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; @@ -3528,7 +3535,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = @@ -3744,7 +3751,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) block_t total_node_blocks = 0; do { - readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, META_SIT, true); start = start_blk * sit_i->sents_per_block; @@ -3962,7 +3969,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) up_write(&sit_i->sentry_lock); } -int build_segment_manager(struct f2fs_sb_info *sbi) +int 
f2fs_build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -3999,7 +4006,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) init_rwsem(&sm_info->curseg_lock); if (!f2fs_readonly(sbi->sb)) { - err = create_flush_cmd_control(sbi); + err = f2fs_create_flush_cmd_control(sbi); if (err) return err; } @@ -4124,13 +4131,13 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kfree(sit_i); } -void destroy_segment_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; - destroy_flush_cmd_control(sbi, true); + f2fs_destroy_flush_cmd_control(sbi, true); destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); @@ -4140,7 +4147,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) kfree(sm_info); } -int __init create_segment_manager_caches(void) +int __init f2fs_create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("discard_entry", sizeof(struct discard_entry)); @@ -4173,7 +4180,7 @@ int __init create_segment_manager_caches(void) return -ENOMEM; } -void destroy_segment_manager_caches(void) +void f2fs_destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 0b5664a1a6cc..36cfd816c160 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -109,11 +109,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, /* shrink clean nat cache entries */ if (freed < nr) - freed += try_to_free_nats(sbi, nr - freed); + freed += f2fs_try_to_free_nats(sbi, nr - freed); /* shrink free nids cache entries */ if (freed < nr) - freed += try_to_free_nids(sbi, nr - freed); + freed += f2fs_try_to_free_nids(sbi, nr - freed); spin_lock(&f2fs_list_lock); p = p->next; diff --git a/fs/f2fs/super.c 
b/fs/f2fs/super.c index 9819c04e6848..d306725d7399 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -865,7 +865,7 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1002,7 +1002,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; - write_checkpoint(sbi, &cpc); + f2fs_write_checkpoint(sbi, &cpc); } /* be sure to wait for any on-going discard commands */ @@ -1012,17 +1012,17 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; - write_checkpoint(sbi, &cpc); + f2fs_write_checkpoint(sbi, &cpc); } - /* write_checkpoint can update stat informaion */ + /* f2fs_write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. 
*/ - release_ino_entry(sbi, true); + f2fs_release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); @@ -1034,8 +1034,8 @@ static void f2fs_put_super(struct super_block *sb) iput(sbi->meta_inode); /* destroy f2fs internal modules */ - destroy_node_manager(sbi); - destroy_segment_manager(sbi); + f2fs_destroy_node_manager(sbi); + f2fs_destroy_segment_manager(sbi); kfree(sbi->ckpt); @@ -1078,7 +1078,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } f2fs_trace_ios(NULL, 1); @@ -1481,11 +1481,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { - stop_gc_thread(sbi); + f2fs_stop_gc_thread(sbi); need_restart_gc = true; } } else if (!sbi->gc_thread) { - err = start_gc_thread(sbi); + err = f2fs_start_gc_thread(sbi); if (err) goto restore_opts; need_stop_gc = true; @@ -1508,9 +1508,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); - destroy_flush_cmd_control(sbi, false); + f2fs_destroy_flush_cmd_control(sbi, false); } else { - err = create_flush_cmd_control(sbi); + err = f2fs_create_flush_cmd_control(sbi); if (err) goto restore_gc; } @@ -1528,11 +1528,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) return 0; restore_gc: if (need_restart_gc) { - if (start_gc_thread(sbi)) + if (f2fs_start_gc_thread(sbi)) f2fs_msg(sbi->sb, KERN_WARNING, "background gc thread has stopped"); } else if (need_stop_gc) { - stop_gc_thread(sbi); + f2fs_stop_gc_thread(sbi); } restore_opts: #ifdef CONFIG_QUOTA @@ -1953,7 +1953,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode 
*inode; - if (check_nid_range(sbi, ino)) + if (f2fs_check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* @@ -2279,7 +2279,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 0; } -int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -2832,7 +2832,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_io_dummy; } - err = get_valid_checkpoint(sbi); + err = f2fs_get_valid_checkpoint(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; @@ -2862,18 +2862,18 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->inode_lock[i]); } - init_extent_cache_info(sbi); + f2fs_init_extent_cache_info(sbi); - init_ino_entry_info(sbi); + f2fs_init_ino_entry_info(sbi); /* setup f2fs internal modules */ - err = build_segment_manager(sbi); + err = f2fs_build_segment_manager(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to initialize F2FS segment manager"); goto free_sm; } - err = build_node_manager(sbi); + err = f2fs_build_node_manager(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to initialize F2FS node manager"); @@ -2891,7 +2891,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->kbytes_written = le64_to_cpu(seg_i->journal->info.kbytes_written); - build_gc_manager(sbi); + f2fs_build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); @@ -2943,7 +2943,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } #endif /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_recover_orphan_inodes(sbi); if (err) goto free_meta; @@ -2965,7 +2965,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!retry) goto skip_recovery; - 
err = recover_fsync_data(sbi, false); + err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, @@ -2973,7 +2973,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_meta; } } else { - err = recover_fsync_data(sbi, true); + err = f2fs_recover_fsync_data(sbi, true); if (!f2fs_readonly(sb) && err > 0) { err = -EINVAL; @@ -2983,7 +2983,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } } skip_recovery: - /* recover_fsync_data() cleared this already */ + /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); /* @@ -2992,7 +2992,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) */ if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); + err = f2fs_start_gc_thread(sbi); if (err) goto free_meta; } @@ -3023,10 +3023,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) #endif f2fs_sync_inode_meta(sbi); /* - * Some dirty meta pages can be produced by recover_orphan_inodes() + * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). + * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in f2fs_sync_meta_pages(). 
*/ truncate_inode_pages_final(META_MAPPING(sbi)); #ifdef CONFIG_QUOTA @@ -3039,13 +3039,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_stats: f2fs_destroy_stats(sbi); free_node_inode: - release_ino_entry(sbi, true); + f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); free_nm: - destroy_node_manager(sbi); + f2fs_destroy_node_manager(sbi); free_sm: - destroy_segment_manager(sbi); + f2fs_destroy_segment_manager(sbi); free_devices: destroy_device_list(sbi); kfree(sbi->ckpt); @@ -3091,8 +3091,8 @@ static void kill_f2fs_super(struct super_block *sb) { if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); - stop_gc_thread(F2FS_SB(sb)); - stop_discard_thread(F2FS_SB(sb)); + f2fs_stop_gc_thread(F2FS_SB(sb)); + f2fs_stop_discard_thread(F2FS_SB(sb)); } kill_block_super(sb); } @@ -3141,16 +3141,16 @@ static int __init init_f2fs_fs(void) err = init_inodecache(); if (err) goto fail; - err = create_node_manager_caches(); + err = f2fs_create_node_manager_caches(); if (err) goto free_inodecache; - err = create_segment_manager_caches(); + err = f2fs_create_segment_manager_caches(); if (err) goto free_node_manager_caches; - err = create_checkpoint_caches(); + err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; - err = create_extent_cache(); + err = f2fs_create_extent_cache(); if (err) goto free_checkpoint_caches; err = f2fs_init_sysfs(); @@ -3179,13 +3179,13 @@ static int __init init_f2fs_fs(void) free_sysfs: f2fs_exit_sysfs(); free_extent_cache: - destroy_extent_cache(); + f2fs_destroy_extent_cache(); free_checkpoint_caches: - destroy_checkpoint_caches(); + f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: - destroy_segment_manager_caches(); + f2fs_destroy_segment_manager_caches(); free_node_manager_caches: - destroy_node_manager_caches(); + f2fs_destroy_node_manager_caches(); free_inodecache: destroy_inodecache(); fail: @@ -3199,10 +3199,10 
@@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); f2fs_exit_sysfs(); - destroy_extent_cache(); - destroy_checkpoint_caches(); - destroy_segment_manager_caches(); - destroy_node_manager_caches(); + f2fs_destroy_extent_cache(); + f2fs_destroy_checkpoint_caches(); + f2fs_destroy_segment_manager_caches(); + f2fs_destroy_node_manager_caches(); destroy_inodecache(); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index ac3ea6044936..60c827eadd82 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -165,7 +165,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } -static ssize_t __f2fs_sbi_store(struct f2fs_attr *a, +static ssize_t __sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { @@ -201,13 +201,13 @@ static ssize_t __f2fs_sbi_store(struct f2fs_attr *a, down_write(&sbi->sb_lock); - ret = update_extension_list(sbi, name, hot, set); + ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) goto out; ret = f2fs_commit_super(sbi, false); if (ret) - update_extension_list(sbi, name, hot, !set); + f2fs_update_extension_list(sbi, name, hot, !set); out: up_write(&sbi->sb_lock); return ret ? 
ret : count; @@ -288,7 +288,7 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (gc_entry) down_read(&sbi->sb->s_umount); - ret = __f2fs_sbi_store(a, sbi, buf, count); + ret = __sbi_store(a, sbi, buf, count); if (gc_entry) up_read(&sbi->sb->s_umount); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 116be979b897..61a5d9284bc0 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -299,7 +299,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - page = get_node_page(sbi, inode->i_ino); + page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -320,7 +320,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); + xpage = f2fs_get_node_page(sbi, xnid); if (IS_ERR(xpage)) return PTR_ERR(xpage); @@ -444,7 +444,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) - if (!alloc_nid(sbi, &new_nid)) + if (!f2fs_alloc_nid(sbi, &new_nid)) return -ENOSPC; /* write to inline xattr */ @@ -452,9 +452,9 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - in_page = get_node_page(sbi, inode->i_ino); + in_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(in_page)) { - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); return PTR_ERR(in_page); } inline_addr = inline_xattr_addr(inode, in_page); @@ -464,8 +464,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode); - alloc_nid_failed(sbi, new_nid); + err = f2fs_truncate_xattr_node(inode); + f2fs_alloc_nid_failed(sbi, new_nid); if (err) { 
f2fs_put_page(in_page, 1); return err; @@ -478,10 +478,10 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } f2fs_bug_on(sbi, new_nid); @@ -489,13 +489,13 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } - alloc_nid_done(sbi, new_nid); + f2fs_alloc_nid_done(sbi, new_nid); } xattr_addr = page_address(xpage); @@ -733,7 +733,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (err) return err; - /* this case is only from init_inode_metadata */ + /* this case is only from f2fs_init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); From d5b4710fcf381855b348216179e925f78815ef2c Mon Sep 17 00:00:00 2001 From: youngjun yoo Date: Wed, 30 May 2018 04:21:14 +0900 Subject: [PATCH 0768/1212] fs: f2fs: changed variable type of offset "unsigned" to "loff_t" clean up checkpatch warning: WARNING: Prefer 'unsigned int' to bare use of 'unsigned' Signed-off-by: youngjun yoo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f1476c93ded5..516fe3cc85ff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -98,7 +98,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, /* page is wholly or partially inside EOF 
*/ if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { - unsigned offset; + loff_t offset; offset = i_size_read(inode) & ~PAGE_MASK; zero_user_segment(page, offset, PAGE_SIZE); } @@ -543,7 +543,7 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_SIZE - 1); + loff_t offset = from & (PAGE_SIZE - 1); pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; From 39ee53e22320abc578d94dded9244d64d450135a Mon Sep 17 00:00:00 2001 From: youngjun yoo Date: Wed, 30 May 2018 04:33:07 +0900 Subject: [PATCH 0769/1212] fs: f2fs: add missing blank lines after declarations clean up checkpatch warning: WARNING: Missing a blank line after declarations Signed-off-by: youngjun yoo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 516fe3cc85ff..c01e97426b2f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -99,6 +99,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { loff_t offset; + offset = i_size_read(inode) & ~PAGE_MASK; zero_user_segment(page, offset, PAGE_SIZE); } @@ -416,6 +417,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); @@ -506,6 +508,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); + if (blkaddr == NULL_ADDR) continue; From 1ae5aadab1914fbdfcc24761005203e46fa0b343 Mon Sep 17 00:00:00 2001 From: youngjun yoo Date: Wed, 30 May 2018 04:34:58 +0900 Subject: [PATCH 0770/1212] fs: f2fs: 
insert space around that ':' and ', ' clean up checkpatch error: ERROR: space required after that ':' ERROR: space required after that ',' Signed-off-by: youngjun yoo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c01e97426b2f..de1c712777c9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1487,7 +1487,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, last_off = map.m_lblk + map.m_len - 1; /* update new size to the failed position */ - new_size = (last_off == pg_end) ? offset + len: + new_size = (last_off == pg_end) ? offset + len : (loff_t)(last_off + 1) << PAGE_SHIFT; } else { new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; @@ -2132,7 +2132,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; From 588ecdfd7d023e7ed43fc516823d7df3c9d14fc3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:17 +0800 Subject: [PATCH 0771/1212] f2fs: fix to update mtime correctly If we change system time to the past, get_mtime() will return an overflowed time, and SIT_I(sbi)->max_mtime will be updated incorrectly; this patch fixes the two issues.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/segment.c | 7 ++++--- fs/f2fs/segment.h | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b00c807c8c8b..60b4886f5bb6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1234,7 +1234,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) * modify checkpoint * version number is already updated */ - ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); + ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8672bf574426..9a3dc92ecf23 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1902,8 +1902,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi); - SIT_I(sbi)->max_mtime = se->mtime; + se->mtime = get_mtime(sbi, false); + if (se->mtime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { @@ -3965,7 +3966,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; } - sit_i->max_mtime = get_mtime(sbi); + sit_i->max_mtime = get_mtime(sbi, false); up_write(&sit_i->sentry_lock); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3e7ef7c6771f..f18fc82fbe99 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -745,11 +745,23 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) #endif } -static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, + bool base_time) { struct sit_info *sit_i = SIT_I(sbi); - return 
sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - - sit_i->mounted_time; + time64_t diff, now = ktime_get_real_seconds(); + + if (now >= sit_i->mounted_time) + return sit_i->elapsed_time + now - sit_i->mounted_time; + + /* system time is set to the past */ + if (!base_time) { + diff = sit_i->mounted_time - now; + if (sit_i->elapsed_time >= diff) + return sit_i->elapsed_time - diff; + return 0; + } + return sit_i->elapsed_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, From 6a4540cf1984dafe622622d647f22089ef404839 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:35 +0800 Subject: [PATCH 0772/1212] f2fs: don't change wbc->sync_mode We should never falsify wbc->sync_mode passed from mm, otherwise mm can trigger writeback with wrong IO priority. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cd0f60b5be7a..5264b079b93e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1744,7 +1744,6 @@ static int f2fs_write_node_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, NODE); diff = nr_pages_to_write(sbi, NODE, wbc); - wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); From 853e7339b634660b951d9892e036faf225cf1187 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:36 +0800 Subject: [PATCH 0773/1212] f2fs: let sync node IO interrupt async one Although mixed sync/async IOs can have continuous LBA, as they have different IO priority, block IO scheduler will add them into different queues and commit them separately, resulting in split IOs, which causes worse performance. This patch gives high priority to synchronous IO of nodes, meaning that once synchronous flow starts, it can interrupt asynchronous writeback flow of system flusher, so more big IOs can be expected.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/data.c | 9 +++++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 ++ fs/f2fs/gc.c | 7 +++++++ fs/f2fs/node.c | 21 ++++++++++++++++++--- fs/f2fs/super.c | 3 ++- 7 files changed, 37 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 60b4886f5bb6..e255e9b5538f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1090,7 +1090,9 @@ static int block_operations(struct f2fs_sb_info *sbi) if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); + atomic_inc(&sbi->wb_sync_req[NODE]); err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4b0db685e5d5..a166927355c8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1930,6 +1930,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; struct pagevec pvec; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; @@ -1984,7 +1985,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, bool submitted = false; /* give a priority to WB_SYNC threads */ - if (atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) && + if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; @@ -2104,8 +2105,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) - atomic_inc(&sbi->wb_sync_req); - else if (atomic_read(&sbi->wb_sync_req)) + atomic_inc(&sbi->wb_sync_req[DATA]); + else if (atomic_read(&sbi->wb_sync_req[DATA])) goto skip_write; blk_start_plug(&plug); @@ -2113,7 +2114,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, blk_finish_plug(&plug); if 
(wbc->sync_mode == WB_SYNC_ALL) - atomic_dec(&sbi->wb_sync_req); + atomic_dec(&sbi->wb_sync_req[DATA]); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e91f7ff71dc6..6873b321c2c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1265,7 +1265,7 @@ struct f2fs_sb_info { struct percpu_counter alloc_valid_block_count; /* writeback control */ - atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ /* valid inode count */ struct percpu_counter total_valid_inode_count; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index de1c712777c9..8b0002f05451 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -277,7 +277,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, goto out; } sync_nodes: + atomic_inc(&sbi->wb_sync_req[NODE]); ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); + atomic_dec(&sbi->wb_sync_req[NODE]); if (ret) goto out; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index dcadc0691a3e..772ef64d2035 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -473,12 +473,16 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, block_t start_addr; int off; int phase = 0; + bool fggc = (gc_type == FG_GC); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; + if (fggc && phase == 2) + atomic_inc(&sbi->wb_sync_req[NODE]); + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; @@ -525,6 +529,9 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, if (++phase < 3) goto next_step; + + if (fggc) + atomic_dec(&sbi->wb_sync_req[NODE]); } /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5264b079b93e..baa8ee1aca38 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1596,21 +1596,28 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, int step = 0; int nwritten = 0; int ret = 0; - int nr_pages; + int nr_pages, 
done = 0; pagevec_init(&pvec, 0); next_step: index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[NODE]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1741,6 +1748,11 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[NODE]); + else if (atomic_read(&sbi->wb_sync_req[NODE])) + goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); diff = nr_pages_to_write(sbi, NODE, wbc); @@ -1748,6 +1760,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[NODE]); return 0; skip_write: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d306725d7399..0b803213ed64 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2360,7 +2360,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); - atomic_set(&sbi->wb_sync_req, 0); + for (i = 0; i < META; i++) + atomic_set(&sbi->wb_sync_req[i], 0); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); From d400752f547f8aea87260885fcdceed3a58e9072 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:51 +0800 Subject: [PATCH 0774/1212] f2fs: fix to clear FI_VOLATILE_FILE correctly Thread A Thread B - f2fs_release_file - 
clear_inode_flag(FI_VOLATILE_FILE) - wb_writeback - writeback_sb_inodes - __writeback_single_inode - do_writepages - f2fs_write_data_pages - __write_data_page all volatile file's pages are written back to storage - set_inode_flag(FI_DROP_CACHE) - filemap_fdatawrite There is a hole in which mm can flush all dirty pages of a volatile file, because the inode is tagged with neither the FI_VOLATILE_FILE nor the FI_DROP_CACHE flag. We should never write back page #0, and it is also unnecessary to write back the other pages. This patch relocates clear_inode_flag(FI_VOLATILE_FILE), so that the FI_VOLATILE_FILE flag remains set until all dirty pages have been dropped, which avoids the issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8b0002f05451..2ca53f7b94e9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1571,11 +1571,11 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) if (f2fs_is_atomic_file(inode)) f2fs_drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); } return 0; } From c41203299a521a7ba9bb41afbc14c534ee1e3554 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 7 May 2018 16:47:02 -0700 Subject: [PATCH 0775/1212] overflow.h: Add allocation size calculation helpers In preparation for replacing unchecked overflows for memory allocations, this creates helpers for the 3 most common calculations: array_size(a, b): 2-dimensional array array3_size(a, b, c): 3-dimensional array struct_size(ptr, member, n): struct followed by n-many trailing members Each of these returns SIZE_MAX on overflow instead of wrapping around.
(Additionally renames a variable named "array_size" to avoid future collision.) Co-developed-by: Matthew Wilcox Signed-off-by: Kees Cook --- drivers/md/dm-table.c | 10 +- include/linux/overflow.h | 278 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 283 insertions(+), 5 deletions(-) create mode 100644 include/linux/overflow.h diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cb5d0daf53bb..8e9646a2550d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -507,14 +507,14 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) * On the other hand, dm-switch needs to process bulk data using messages and * excessive use of GFP_NOIO could cause trouble. */ -static char **realloc_argv(unsigned *array_size, char **old_argv) +static char **realloc_argv(unsigned *size, char **old_argv) { char **argv; unsigned new_size; gfp_t gfp; - if (*array_size) { - new_size = *array_size * 2; + if (*size) { + new_size = *size * 2; gfp = GFP_KERNEL; } else { new_size = 8; @@ -522,8 +522,8 @@ static char **realloc_argv(unsigned *array_size, char **old_argv) } argv = kmalloc(new_size * sizeof(*argv), gfp); if (argv) { - memcpy(argv, old_argv, *array_size * sizeof(*argv)); - *array_size = new_size; + memcpy(argv, old_argv, *size * sizeof(*argv)); + *size = new_size; } kfree(old_argv); diff --git a/include/linux/overflow.h b/include/linux/overflow.h new file mode 100644 index 000000000000..8712ff70995f --- /dev/null +++ b/include/linux/overflow.h @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#include + +/* + * In the fallback code below, we need to compute the minimum and + * maximum values representable in a given type. These macros may also + * be useful elsewhere, so we provide them outside the + * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. + * + * It would seem more obvious to do something like + * + * #define type_min(T) (T)(is_signed_type(T) ? 
(T)1 << (8*sizeof(T)-1) : 0) + * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) + * + * Unfortunately, the middle expressions, strictly speaking, have + * undefined behaviour, and at least some versions of gcc warn about + * the type_max expression (but not if -fsanitize=undefined is in + * effect; in that case, the warning is deferred to runtime...). + * + * The slightly excessive casting in type_min is to make sure the + * macros also produce sensible values for the exotic type _Bool. [The + * overflow checkers only almost work for _Bool, but that's + * a-feature-not-a-bug, since people shouldn't be doing arithmetic on + * _Bools. Besides, the gcc builtins don't allow _Bool* as third + * argument.] + * + * Idea stolen from + * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - + * credit to Christian Biere. + */ +#define is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +/* + * For simplicity and code hygiene, the fallback code below insists on + * a, b and *d having the same type (similar to the min() and max() + * macros), whereas gcc's type-generic overflow checkers accept + * different types. Hence we don't just make check_add_overflow an + * alias for __builtin_add_overflow, but add type checks similar to + * below. 
+ */ +#define check_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_add_overflow(__a, __b, __d); \ +}) + +#define check_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_sub_overflow(__a, __b, __d); \ +}) + +#define check_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +}) + +#else + + +/* Checking for unsigned overflow is relatively easy without causing UB. */ +#define __unsigned_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a + __b; \ + *__d < __a; \ +}) +#define __unsigned_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a - __b; \ + __a < __b; \ +}) +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? \ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * For signed types, detecting overflow is much harder, especially if + * we want to avoid UB. But the interface of these macros is such that + * we must provide a result in *d, and in fact we must produce the + * result promised by gcc's builtins, which is simply the possibly + * wrapped-around value. 
Fortunately, we can just formally do the + * operations in the widest relevant unsigned type (u64) and then + * truncate the result - gcc is smart enough to generate the same code + * with and without the (u64) casts. + */ + +/* + * Adding two signed integers can overflow only if they have the same + * sign, and overflow has happened iff the result has the opposite + * sign. + */ +#define __signed_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a + (u64)__b; \ + (((~(__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Subtraction is similar, except that overflow can now happen only + * when the signs are opposite. In this case, overflow has happened if + * the result has the opposite sign of a. + */ +#define __signed_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a - (u64)__b; \ + ((((__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. 
+ */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a * (u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + + +#define check_add_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_add_overflow(a, b, d), \ + __unsigned_add_overflow(a, b, d)) + +#define check_sub_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_sub_overflow(a, b, d), \ + __unsigned_sub_overflow(a, b, d)) + +#define check_mul_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d)) + + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +/** + * array_size() - Calculate size of 2-dimensional array. + * + * @a: dimension one + * @b: dimension two + * + * Calculates size of 2-dimensional array: @a * @b. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array_size(size_t a, size_t b) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * array3_size() - Calculate size of 3-dimensional array. + * + * @a: dimension one + * @b: dimension two + * @c: dimension three + * + * Calculates size of 3-dimensional array: @a * @b * @c. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. 
+ */ +static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_mul_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(n, size, &bytes)) + return SIZE_MAX; + if (check_add_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * struct_size() - Calculate size of structure with trailing array. + * @p: Pointer to the structure. + * @member: Name of the array member. + * @n: Number of elements in the array. + * + * Calculates size of memory needed for structure @p followed by an + * array of @n @member elements. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size(p, member, n) \ + __ab_c_size(n, \ + sizeof(*(p)->member) + __must_be_array((p)->member),\ + sizeof(*(p))) + +#endif /* __LINUX_OVERFLOW_H */ From 3ea03ea4bd0940bb8f9bc18f957918d1fd7e90db Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:28:16 -0700 Subject: [PATCH 0776/1212] treewide: Use array_size() in f2fs_kmalloc() The f2fs_kmalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: f2fs_kmalloc(handle, a * b, gfp) with: f2fs_kmalloc(handle, array_size(a, b), gfp) as well as handling cases of: f2fs_kmalloc(handle, a * b * c, gfp) with: f2fs_kmalloc(handle, array3_size(a, b, c), gfp) This does, however, attempt to ignore constant size factors like: f2fs_kmalloc(handle, 4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). 
@@ expression HANDLE; type TYPE; expression THING, E; @@ ( f2fs_kmalloc(HANDLE, - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | f2fs_kmalloc(HANDLE, - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression HANDLE; expression COUNT; typedef u8; typedef __u8; @@ ( f2fs_kmalloc(HANDLE, - sizeof(u8) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(__u8) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(char) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(unsigned char) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(u8) * COUNT + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(__u8) * COUNT + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(char) * COUNT + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ expression HANDLE; type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ expression HANDLE; identifier SIZE, COUNT; @@ f2fs_kmalloc(HANDLE, - SIZE * COUNT + array_size(COUNT, SIZE) , ...) 
// 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression HANDLE; expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression HANDLE; expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) 
) // 3-factor product, only identifiers, with redundant parens removed. @@ expression HANDLE; identifier STRIDE, SIZE, COUNT; @@ ( f2fs_kmalloc(HANDLE, - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression HANDLE; expression E1, E2, E3; constant C1, C2, C3; @@ ( f2fs_kmalloc(HANDLE, C1 * C2 * C3, ...) | f2fs_kmalloc(HANDLE, - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression HANDLE; expression E1, E2; constant C1, C2; @@ ( f2fs_kmalloc(HANDLE, C1 * C2, ...) | f2fs_kmalloc(HANDLE, - E1 * E2 + array_size(E1, E2) , ...) 
) Signed-off-by: Kees Cook --- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6873b321c2c1..0b514cf1ac6f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -26,6 +26,7 @@ #include #include #include +#include #define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) #include diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0b803213ed64..bfa56b037ed8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2791,9 +2791,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) int n = (i == META) ? 1: NR_TEMP_TYPE; int j; - sbi->write_io[i] = f2fs_kmalloc(sbi, - n * sizeof(struct f2fs_bio_info), - GFP_KERNEL); + sbi->write_io[i] = + f2fs_kmalloc(sbi, + array_size(n, + sizeof(struct f2fs_bio_info)), + GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; goto free_options; From f15443db99c35cd3bf44d76bc4f6d181f89e4acd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:28:23 -0700 Subject: [PATCH 0777/1212] treewide: Use array_size() in f2fs_kzalloc() The f2fs_kzalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: f2fs_kzalloc(handle, a * b, gfp) with: f2fs_kzalloc(handle, array_size(a, b), gfp) as well as handling cases of: f2fs_kzalloc(handle, a * b * c, gfp) with: f2fs_kzalloc(handle, array3_size(a, b, c), gfp) This does, however, attempt to ignore constant size factors like: f2fs_kzalloc(handle, 4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ expression HANDLE; type TYPE; expression THING, E; @@ ( f2fs_kzalloc(HANDLE, - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) 
| f2fs_kzalloc(HANDLE, - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression HANDLE; expression COUNT; typedef u8; typedef __u8; @@ ( f2fs_kzalloc(HANDLE, - sizeof(u8) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(__u8) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(char) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(unsigned char) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(u8) * COUNT + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(__u8) * COUNT + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(char) * COUNT + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ expression HANDLE; type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ expression HANDLE; identifier SIZE, COUNT; @@ f2fs_kzalloc(HANDLE, - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. 
@@ expression HANDLE; expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression HANDLE; expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. 
@@ expression HANDLE; identifier STRIDE, SIZE, COUNT; @@ ( f2fs_kzalloc(HANDLE, - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression HANDLE; expression E1, E2, E3; constant C1, C2, C3; @@ ( f2fs_kzalloc(HANDLE, C1 * C2 * C3, ...) | f2fs_kzalloc(HANDLE, - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression HANDLE; expression E1, E2; constant C1, C2; @@ ( f2fs_kzalloc(HANDLE, C1 * C2, ...) | f2fs_kzalloc(HANDLE, - E1 * E2 + array_size(E1, E2) , ...) 
) Signed-off-by: Kees Cook --- fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/node.c | 6 ++++-- fs/f2fs/segment.c | 3 ++- fs/f2fs/super.c | 12 ++++++++---- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e255e9b5538f..178623c15765 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -807,7 +807,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) block_t cp_blk_no; int i; - sbi->ckpt = f2fs_kzalloc(sbi, cp_blks * blk_size, GFP_KERNEL); + sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index baa8ee1aca38..1ad24998e29c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2756,8 +2756,10 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); int i; - nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks * - sizeof(unsigned char *), GFP_KERNEL); + nm_i->free_nid_bitmap = + f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9a3dc92ecf23..97ec716ac0c1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3716,7 +3716,8 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = f2fs_kzalloc(sbi, sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), + GFP_KERNEL); if (!array) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bfa56b037ed8..08635dc2594f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2423,8 +2423,10 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) #define F2FS_REPORT_NR_ZONES 4096 - zones = f2fs_kzalloc(sbi, sizeof(struct blk_zone) * - F2FS_REPORT_NR_ZONES, GFP_KERNEL); + zones = f2fs_kzalloc(sbi, + array_size(F2FS_REPORT_NR_ZONES, + sizeof(struct 
blk_zone)), + GFP_KERNEL); if (!zones) return -ENOMEM; @@ -2568,8 +2570,10 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) * Initialize multiple devices information, or single * zoned block device information. */ - sbi->devs = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_info) * - max_devices, GFP_KERNEL); + sbi->devs = f2fs_kzalloc(sbi, + array_size(max_devices, + sizeof(struct f2fs_dev_info)), + GFP_KERNEL); if (!sbi->devs) return -ENOMEM; From 6944da0a68ca00f8f27bd71e0e0e292ea14b5ca5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:28:35 -0700 Subject: [PATCH 0778/1212] treewide: Use array_size in f2fs_kvzalloc() The f2fs_kvzalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: f2fs_kvzalloc(handle, a * b, gfp) with: f2fs_kvzalloc(handle, array_size(a, b), gfp) as well as handling cases of: f2fs_kvzalloc(handle, a * b * c, gfp) with: f2fs_kvzalloc(handle, array3_size(a, b, c), gfp) This does, however, attempt to ignore constant size factors like: f2fs_kvzalloc(handle, 4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ expression HANDLE; type TYPE; expression THING, E; @@ ( f2fs_kvzalloc(HANDLE, - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | f2fs_kvzalloc(HANDLE, - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression HANDLE; expression COUNT; typedef u8; typedef __u8; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(u8) * (COUNT) + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(__u8) * (COUNT) + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(char) * (COUNT) + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(unsigned char) * (COUNT) + COUNT , ...) 
| f2fs_kvzalloc(HANDLE, - sizeof(u8) * COUNT + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(__u8) * COUNT + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(char) * COUNT + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ expression HANDLE; type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ expression HANDLE; identifier SIZE, COUNT; @@ f2fs_kvzalloc(HANDLE, - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression HANDLE; expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) 
| f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression HANDLE; expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ expression HANDLE; identifier STRIDE, SIZE, COUNT; @@ ( f2fs_kvzalloc(HANDLE, - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) 
| f2fs_kvzalloc(HANDLE, - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression HANDLE; expression E1, E2, E3; constant C1, C2, C3; @@ ( f2fs_kvzalloc(HANDLE, C1 * C2 * C3, ...) | f2fs_kvzalloc(HANDLE, - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression HANDLE; expression E1, E2; constant C1, C2; @@ ( f2fs_kvzalloc(HANDLE, C1 * C2, ...) | f2fs_kvzalloc(HANDLE, - E1 * E2 + array_size(E1, E2) , ...) ) Signed-off-by: Kees Cook --- fs/f2fs/file.c | 6 ++++-- fs/f2fs/node.c | 6 ++++-- fs/f2fs/segment.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2ca53f7b94e9..1ada29893092 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1129,12 +1129,14 @@ static int __exchange_data_block(struct inode *src_inode, olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), - sizeof(block_t) * olen, GFP_KERNEL); + array_size(olen, sizeof(block_t)), + GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), - sizeof(int) * olen, GFP_KERNEL); + array_size(olen, sizeof(int)), + GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1ad24998e29c..b72fac4766a9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2775,8 +2775,10 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(sbi, nm_i->nat_blocks * - sizeof(unsigned short), GFP_KERNEL); + nm_i->free_nid_count = + 
f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short), + nm_i->nat_blocks), + GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 97ec716ac0c1..3d0c42ef0474 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3598,8 +3598,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(sbi, MAIN_SEGS(sbi) * - sizeof(struct seg_entry), GFP_KERNEL); + sit_i->sentries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), + MAIN_SEGS(sbi)), + GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; @@ -3639,8 +3641,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(sbi, MAIN_SECS(sbi) * - sizeof(struct sec_entry), GFP_KERNEL); + sit_i->sec_entries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), + MAIN_SECS(sbi)), + GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } From 4e55d28084cc1c94c62b63ece1eeeeb29dc4941e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 14 Jul 2018 12:52:10 -0400 Subject: [PATCH 0779/1212] net: 6lowpan: fix reserved space for single frames commit ac74f87c789af40936a80131c4759f3e72579c3a upstream. This patch adds handling to take care of tail and headroom for single 6lowpan frames. We need to be sure we have a skb with the right head and tailroom for single frames. This patch does it by using skb_copy_expand() if the head and tailroom allocated by the upper layer is not enough.
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195059 Reported-by: David Palma Reported-by: Rabi Narayan Sahoo Cc: stable@vger.kernel.org Signed-off-by: Alexander Aring Signed-off-by: Stefan Schmidt Signed-off-by: Greg Kroah-Hartman --- net/ieee802154/6lowpan/tx.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index d4353faced35..a10db45b2e1e 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -265,9 +265,24 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) - return NET_XMIT_DROP; + if (unlikely(skb_headroom(skb) < ldev->needed_headroom || + skb_tailroom(skb) < ldev->needed_tailroom)) { + struct sk_buff *nskb; + + nskb = skb_copy_expand(skb, ldev->needed_headroom, + ldev->needed_tailroom, GFP_ATOMIC); + if (likely(nskb)) { + consume_skb(skb); + skb = nskb; + } else { + kfree_skb(skb); + return NET_XMIT_DROP; + } + } else { + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + } ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset); if (ret < 0) { From aeca800e562ddca7449f8d39732f4cd2b5a41dce Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 2 Jul 2018 16:32:03 -0400 Subject: [PATCH 0780/1212] net: mac802154: tx: expand tailroom if necessary commit f9c52831133050c6b82aa8b6831c92da2bbf2a0b upstream. This patch is necessary in case AF_PACKET, or another socket interface that I am aware of, didn't allocate the necessary room.
Reported-by: David Palma Reported-by: Rabi Narayan Sahoo Cc: stable@vger.kernel.org Signed-off-by: Alexander Aring Signed-off-by: Stefan Schmidt Signed-off-by: Greg Kroah-Hartman --- net/mac802154/tx.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 3827f359b336..9e1ff9d4cf2d 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -72,8 +72,21 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) int ret; if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) { - u16 crc = crc_ccitt(0, skb->data, skb->len); + struct sk_buff *nskb; + u16 crc; + if (unlikely(skb_tailroom(skb) < IEEE802154_FCS_LEN)) { + nskb = skb_copy_expand(skb, 0, IEEE802154_FCS_LEN, + GFP_ATOMIC); + if (likely(nskb)) { + consume_skb(skb); + skb = nskb; + } else { + goto err_tx; + } + } + + crc = crc_ccitt(0, skb->data, skb->len); put_unaligned_le16(crc, skb_put(skb, 2)); } From c9f7c99fc92dc8a36a7d8f2a8af0457bc9aa20a0 Mon Sep 17 00:00:00 2001 From: Chirantan Ekbote Date: Mon, 16 Jul 2018 17:35:29 -0700 Subject: [PATCH 0781/1212] 9p/net: Fix zero-copy path in the 9p virtio transport commit d28c756caee6e414d9ba367d0b92da24145af2a8 upstream. The zero-copy optimization when reading or writing large chunks of data is quite useful. However, the 9p messages created through the zero-copy write path have an incorrect message size: it should be the size of the header + size of the data being written but instead it's just the size of the header. This only works if the server ignores the size field of the message and otherwise breaks the framing of the protocol. Fix this by re-writing the message size field with the correct value. Tested by running `dd if=/dev/zero of=out bs=4k count=1` inside a virtio-9p mount. 
Link: http://lkml.kernel.org/r/20180717003529.114368-1-chirantan@chromium.org Signed-off-by: Chirantan Ekbote Reviewed-by: Greg Kurz Tested-by: Greg Kurz Cc: Dylan Reid Cc: Guenter Roeck Cc: stable@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/trans_virtio.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 2ddeecca5b12..cb2276b91b3c 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -409,6 +409,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, p9_debug(P9_DEBUG_TRANS, "virtio request\n"); if (uodata) { + __le32 sz; int n = p9_get_mapped_pages(chan, &out_pages, uodata, outlen, &offs, &need_drop); if (n < 0) @@ -419,6 +420,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); outlen = n; } + /* The size field of the message must include the length of the + * header and the length of the data. We didn't actually know + * the length of the data until this point so add it in now. + */ + sz = cpu_to_le32(req->tc->size + outlen); + memcpy(&req->tc->sdata[0], &sz, sizeof(sz)); } else if (uidata) { int n = p9_get_mapped_pages(chan, &in_pages, uidata, inlen, &offs, &need_drop); From b4de9ac2c750127c2d8bc9a737eca611405e1171 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 7 Sep 2018 01:13:40 +0100 Subject: [PATCH 0782/1212] net: lan78xx: Fix misplaced tasklet_schedule() call Commit 136f55f66019 ("net: lan78xx: fix rx handling before first packet is send") was not correctly backported to 4.4. The call to tasklet_schedule() belongs in lan78xx_link_reset(). 
Fixes: d1fc12d8475c ("net: lan78xx: fix rx handling before first packet is send") Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/lan78xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index acec4b565511..1aede726052c 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -902,6 +902,8 @@ static int lan78xx_link_reset(struct lan78xx_net *dev) ret = lan78xx_update_flowcontrol(dev, ecmd.duplex, ladv, radv); netif_carrier_on(dev->net); + + tasklet_schedule(&dev->bh); } return ret; @@ -1361,8 +1363,6 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev) netif_dbg(dev, ifup, dev->net, "MAC address set to random addr"); } - - tasklet_schedule(&dev->bh); } ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo); From c2b736ff27b4c5bdfb7d66559383f851e40ff495 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 10 Aug 2018 11:13:52 +0200 Subject: [PATCH 0783/1212] spi: davinci: fix a NULL pointer dereference commit 563a53f3906a6b43692498e5b3ae891fac93a4af upstream. On non-OF systems spi->controller_data may be NULL. This causes a NULL pointer dereference on dm365-evm. 
Signed-off-by: Bartosz Golaszewski Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-davinci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c index c872a2e54c4b..2603bee2ce07 100644 --- a/drivers/spi/spi-davinci.c +++ b/drivers/spi/spi-davinci.c @@ -220,7 +220,7 @@ static void davinci_spi_chipselect(struct spi_device *spi, int value) pdata = &dspi->pdata; /* program delay transfers if tx_delay is non zero */ - if (spicfg->wdelay) + if (spicfg && spicfg->wdelay) spidat1 |= SPIDAT1_WDEL; /* From 182e963432d867384f2e55487ec60ca7a9f99cd1 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 2 May 2018 20:50:21 +0100 Subject: [PATCH 0784/1212] drm/i915/userptr: reject zero user_size commit c11c7bfd213495784b22ef82a69b6489f8d0092f upstream. Operating on a zero sized GEM userptr object will lead to explosions. Fixes: 5cc9ed4b9a7a ("drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl") Testcase: igt/gem_userptr_blits/input-checking Signed-off-by: Matthew Auld Cc: Chris Wilson Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20180502195021.30900-1-matthew.auld@intel.com Cc: Loic Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gem_userptr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 19fb0bddc1cd..359fe2b8bb8a 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -842,6 +842,9 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file I915_USERPTR_UNSYNCHRONIZED)) return -EINVAL; + if (!args->user_size) + return -EINVAL; + if (offset_in_page(args->user_ptr | args->user_size)) return -EINVAL; From c9fadf27006ba098fcd46e7e1f0fb1daedce33d4 Mon Sep 17 00:00:00 2001 From: Hari Bathini 
Date: Tue, 7 Aug 2018 02:12:45 +0530 Subject: [PATCH 0785/1212] powerpc/fadump: handle crash memory ranges array index overflow commit 1bd6a1c4b80a28d975287630644e6b47d0f977a5 upstream. Crash memory ranges is an array of memory ranges of the crashing kernel to be exported as a dump via /proc/vmcore file. The size of the array is set based on INIT_MEMBLOCK_REGIONS, which works alright in most cases where memblock memory regions count is less than INIT_MEMBLOCK_REGIONS value. But this count can grow beyond INIT_MEMBLOCK_REGIONS value since commit 142b45a72e22 ("memblock: Add array resizing support"). On large memory systems with a few DLPAR operations, the memblock memory regions count could be larger than INIT_MEMBLOCK_REGIONS value. On such systems, registering fadump results in crash or other system failures like below: task: c00007f39a290010 ti: c00000000b738000 task.ti: c00000000b738000 NIP: c000000000047df4 LR: c0000000000f9e58 CTR: c00000000010f180 REGS: c00000000b73b570 TRAP: 0300 Tainted: G L X (4.4.140+) MSR: 8000000000009033 CR: 22004484 XER: 20000000 CFAR: c000000000008500 DAR: 000007a450000000 DSISR: 40000000 SOFTE: 0 ... NIP [c000000000047df4] smp_send_reschedule+0x24/0x80 LR [c0000000000f9e58] resched_curr+0x138/0x160 Call Trace: resched_curr+0x138/0x160 (unreliable) check_preempt_curr+0xc8/0xf0 ttwu_do_wakeup+0x38/0x150 try_to_wake_up+0x224/0x4d0 __wake_up_common+0x94/0x100 ep_poll_callback+0xac/0x1c0 __wake_up_common+0x94/0x100 __wake_up_sync_key+0x70/0xa0 sock_def_readable+0x58/0xa0 unix_stream_sendmsg+0x2dc/0x4c0 sock_sendmsg+0x68/0xa0 ___sys_sendmsg+0x2cc/0x2e0 __sys_sendmsg+0x5c/0xc0 SyS_socketcall+0x36c/0x3f0 system_call+0x3c/0x100 as array index overflow is not checked for while setting up crash memory ranges causing memory corruption. To resolve this issue, dynamically allocate memory for crash memory ranges and resize it incrementally, in units of pagesize, on hitting array size limit. 
Fixes: 2df173d9e85d ("fadump: Initialize elfcore header and add PT_LOAD program headers.") Cc: stable@vger.kernel.org # v3.4+ Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar [mpe: Just use PAGE_SIZE directly, fixup variable placement] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/fadump.h | 3 - arch/powerpc/kernel/fadump.c | 92 ++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 493e72f64b35..5768ec3c1781 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -194,9 +194,6 @@ struct fadump_crash_info_header { struct cpumask cpu_online_mask; }; -/* Crash memory ranges */ -#define INIT_CRASHMEM_RANGES (INIT_MEMBLOCK_REGIONS + 2) - struct fad_crash_memory_ranges { unsigned long long base; unsigned long long size; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 791d4c3329c3..c3c835290131 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -48,8 +49,10 @@ static struct fadump_mem_struct fdm; static const struct fadump_mem_struct *fdm_active; static DEFINE_MUTEX(fadump_mutex); -struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES]; +struct fad_crash_memory_ranges *crash_memory_ranges; +int crash_memory_ranges_size; int crash_mem_ranges; +int max_crash_mem_ranges; /* Scan the Firmware Assisted dump configuration details. 
*/ int __init early_init_dt_scan_fw_dump(unsigned long node, @@ -726,38 +729,88 @@ static int __init process_fadump(const struct fadump_mem_struct *fdm_active) return 0; } -static inline void fadump_add_crash_memory(unsigned long long base, - unsigned long long end) +static void free_crash_memory_ranges(void) +{ + kfree(crash_memory_ranges); + crash_memory_ranges = NULL; + crash_memory_ranges_size = 0; + max_crash_mem_ranges = 0; +} + +/* + * Allocate or reallocate crash memory ranges array in incremental units + * of PAGE_SIZE. + */ +static int allocate_crash_memory_ranges(void) +{ + struct fad_crash_memory_ranges *new_array; + u64 new_size; + + new_size = crash_memory_ranges_size + PAGE_SIZE; + pr_debug("Allocating %llu bytes of memory for crash memory ranges\n", + new_size); + + new_array = krealloc(crash_memory_ranges, new_size, GFP_KERNEL); + if (new_array == NULL) { + pr_err("Insufficient memory for setting up crash memory ranges\n"); + free_crash_memory_ranges(); + return -ENOMEM; + } + + crash_memory_ranges = new_array; + crash_memory_ranges_size = new_size; + max_crash_mem_ranges = (new_size / + sizeof(struct fad_crash_memory_ranges)); + return 0; +} + +static inline int fadump_add_crash_memory(unsigned long long base, + unsigned long long end) { if (base == end) - return; + return 0; + + if (crash_mem_ranges == max_crash_mem_ranges) { + int ret; + + ret = allocate_crash_memory_ranges(); + if (ret) + return ret; + } pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", crash_mem_ranges, base, end - 1, (end - base)); crash_memory_ranges[crash_mem_ranges].base = base; crash_memory_ranges[crash_mem_ranges].size = end - base; crash_mem_ranges++; + return 0; } -static void fadump_exclude_reserved_area(unsigned long long start, +static int fadump_exclude_reserved_area(unsigned long long start, unsigned long long end) { unsigned long long ra_start, ra_end; + int ret = 0; ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + 
fw_dump.reserve_dump_area_size; if ((ra_start < end) && (ra_end > start)) { if ((start < ra_start) && (end > ra_end)) { - fadump_add_crash_memory(start, ra_start); - fadump_add_crash_memory(ra_end, end); + ret = fadump_add_crash_memory(start, ra_start); + if (ret) + return ret; + + ret = fadump_add_crash_memory(ra_end, end); } else if (start < ra_start) { - fadump_add_crash_memory(start, ra_start); + ret = fadump_add_crash_memory(start, ra_start); } else if (ra_end < end) { - fadump_add_crash_memory(ra_end, end); + ret = fadump_add_crash_memory(ra_end, end); } } else - fadump_add_crash_memory(start, end); + ret = fadump_add_crash_memory(start, end); + + return ret; } static int fadump_init_elfcore_header(char *bufp) @@ -793,10 +846,11 @@ static int fadump_init_elfcore_header(char *bufp) * Traverse through memblock structure and setup crash memory ranges. These * ranges will be used create PT_LOAD program headers in elfcore header. */ -static void fadump_setup_crash_memory_ranges(void) +static int fadump_setup_crash_memory_ranges(void) { struct memblock_region *reg; unsigned long long start, end; + int ret; pr_debug("Setup crash memory ranges.\n"); crash_mem_ranges = 0; @@ -807,7 +861,9 @@ static void fadump_setup_crash_memory_ranges(void) * specified during fadump registration. We need to create a separate * program header for this chunk with the correct offset. */ - fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); + ret = fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); + if (ret) + return ret; for_each_memblock(memory, reg) { start = (unsigned long long)reg->base; @@ -816,8 +872,12 @@ static void fadump_setup_crash_memory_ranges(void) start = fw_dump.boot_memory_size; /* add this range excluding the reserved dump area. 
*/ - fadump_exclude_reserved_area(start, end); + ret = fadump_exclude_reserved_area(start, end); + if (ret) + return ret; } + + return 0; } /* @@ -941,6 +1001,7 @@ static void register_fadump(void) { unsigned long addr; void *vaddr; + int ret; /* * If no memory is reserved then we can not register for firmware- @@ -949,7 +1010,9 @@ static void register_fadump(void) if (!fw_dump.reserve_dump_area_size) return; - fadump_setup_crash_memory_ranges(); + ret = fadump_setup_crash_memory_ranges(); + if (ret) + return ret; addr = be64_to_cpu(fdm.rmr_region.destination_address) + be64_to_cpu(fdm.rmr_region.source_len); /* Initialize fadump crash info header. */ @@ -1028,6 +1091,7 @@ void fadump_cleanup(void) } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. */ fadump_unregister_dump(&fdm); + free_crash_memory_ranges(); } } From fa4cd57290cf0f227e82473550868ddde0d1f074 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 7 Aug 2018 19:46:46 +0530 Subject: [PATCH 0786/1212] powerpc/pseries: Fix endianness while restoring of r3 in MCE handler. commit cd813e1cd7122f2c261dce5b54d1e0c97f80e1a5 upstream. During Machine Check interrupt on pseries platform, register r3 points RTAS extended event log passed by hypervisor. Since hypervisor uses r3 to pass pointer to rtas log, it stores the original r3 value at the start of the memory (first 8 bytes) pointed by r3. Since hypervisor stores this info and rtas log is in BE format, linux should make sure to restore r3 value in correct endian format. Without this patch when MCE handler, after recovery, returns to code that that caused the MCE may end up with Data SLB access interrupt for invalid address followed by kernel panic or hang. 
Severe Machine check interrupt [Recovered] NIP [d00000000ca301b8]: init_module+0x1b8/0x338 [bork_kernel] Initiator: CPU Error type: SLB [Multihit] Effective address: d00000000ca70000 cpu 0xa: Vector: 380 (Data SLB Access) at [c0000000fc7775b0] pc: c0000000009694c0: vsnprintf+0x80/0x480 lr: c0000000009698e0: vscnprintf+0x20/0x60 sp: c0000000fc777830 msr: 8000000002009033 dar: a803a30c000000d0 current = 0xc00000000bc9ef00 paca = 0xc00000001eca5c00 softe: 3 irq_happened: 0x01 pid = 8860, comm = insmod vscnprintf+0x20/0x60 vprintk_emit+0xb4/0x4b0 vprintk_func+0x5c/0xd0 printk+0x38/0x4c init_module+0x1c0/0x338 [bork_kernel] do_one_initcall+0x54/0x230 do_init_module+0x8c/0x248 load_module+0x12b8/0x15b0 sys_finit_module+0xa8/0x110 system_call+0x58/0x6c --- Exception: c00 (System Call) at 00007fff8bda0644 SP (7fffdfbfe980) is in userspace This patch fixes this issue. Fixes: a08a53ea4c97 ("powerpc/le: Enable RTAS events support") Cc: stable@vger.kernel.org # v3.15+ Reviewed-by: Nicholas Piggin Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 3b6647e574b6..f5313a78e5d6 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -300,7 +300,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) } savep = __va(regs->gpr[3]); - regs->gpr[3] = savep[0]; /* restore original r3 */ + regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ /* If it isn't an extended log we can use the per cpu 64bit buffer */ h = (struct rtas_error_log *)&savep[1]; From 2c9ffc9d440d31efa23136f5a64eebccfc2ec553 Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 25 Jul 2018 11:13:16 +0800 Subject: [PATCH 0787/1212] fs/9p/xattr.c: catch the error of p9_client_clunk when setting xattr failed commit 
3111784bee81591ea2815011688d28b65df03627 upstream. In my testing, v9fs_fid_xattr_set will return successfully even if the backend ext4 filesystem has no space to store xattr key-value. That will cause inconsistent behavior between front end and back end. The reason is that lsetxattr will be triggered by p9_client_clunk, and unfortunately we did not catch the error. This patch will catch the error to notify upper caller. p9_client_clunk (in 9p) p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid); v9fs_clunk (in qemu) put_fid free_fid v9fs_xattr_fid_clunk v9fs_co_lsetxattr s->ops->lsetxattr ext4_xattr_user_set (in host ext4 filesystem) Link: http://lkml.kernel.org/r/5B57EACC.2060900@huawei.com Signed-off-by: Jun Piao Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Cc: Andrew Morton Cc: stable@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- fs/9p/xattr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index e3d026ac382e..f35168ce426b 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -107,7 +107,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, { struct kvec kvec = {.iov_base = (void *)value, .iov_len = value_len}; struct iov_iter from; - int retval; + int retval, err; iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); @@ -128,7 +128,9 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, retval); else p9_client_write(fid, 0, &from, &retval); - p9_client_clunk(fid); + err = p9_client_clunk(fid); + if (!retval && err) + retval = err; return retval; } From 1d2e1e399f86ca085eb85a9f68c20cf4fbf2c79d Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Fri, 3 Aug 2018 12:11:34 +0800 Subject: [PATCH 0788/1212] 9p/virtio: fix off-by-one error in sg list bounds check commit 23cba9cbde0bba05d772b335fe5f66aa82b9ad19 upstream. 
Because the value of limit is VIRTQUEUE_NUM, if index is equal to limit, it will cause sg array out of bounds, so correct the judgement of BUG_ON. Link: http://lkml.kernel.org/r/5B63D5F6.6080109@huawei.com Signed-off-by: Yiwen Jiang Reported-By: Dan Carpenter Acked-by: Jun Piao Cc: stable@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/trans_virtio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index cb2276b91b3c..669198ac73db 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -192,7 +192,7 @@ static int pack_sg_list(struct scatterlist *sg, int start, s = rest_of_page(data); if (s > count) s = count; - BUG_ON(index > limit); + BUG_ON(index >= limit); /* Make sure we don't terminate early. */ sg_unmark_end(&sg[index]); sg_set_buf(&sg[index++], data, s); @@ -237,6 +237,7 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, s = PAGE_SIZE - data_off; if (s > count) s = count; + BUG_ON(index >= limit); /* Make sure we don't terminate early. */ sg_unmark_end(&sg[index]); sg_set_page(&sg[index++], pdata[i++], s, data_off); From 34cc7cf15e16a34447581a5956bf1a434fcb190f Mon Sep 17 00:00:00 2001 From: Tomas Bortoli Date: Tue, 10 Jul 2018 00:29:43 +0200 Subject: [PATCH 0789/1212] net/9p/client.c: version pointer uninitialized commit 7913690dcc5e18e235769fd87c34143072f5dbea upstream. The p9_client_version() does not initialize the version pointer. If the call to p9pdu_readf() returns an error and version has not been allocated in p9pdu_readf(), then the program will jump to the "error" label and will try to free the version pointer. If version is not initialized, free() will be called with uninitialized, garbage data and will provoke a crash. 
Link: http://lkml.kernel.org/r/20180709222943.19503-1-tomasbortoli@gmail.com Signed-off-by: Tomas Bortoli Reported-by: syzbot+65c6b72f284a39d416b4@syzkaller.appspotmail.com Reviewed-by: Jun Piao Reviewed-by: Yiwen Jiang Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Cc: stable@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/9p/client.c b/net/9p/client.c index 3ff26eb1ea20..ed8738c4dc09 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -931,7 +931,7 @@ static int p9_client_version(struct p9_client *c) { int err = 0; struct p9_req_t *req; - char *version; + char *version = NULL; int msize; p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", From 06d7a39a9c397a62615122d21dafa4c16506e7e2 Mon Sep 17 00:00:00 2001 From: Tomas Bortoli Date: Fri, 20 Jul 2018 11:27:30 +0200 Subject: [PATCH 0790/1212] net/9p/trans_fd.c: fix race-condition by flushing workqueue before the kfree() commit 430ac66eb4c5b5c4eb846b78ebf65747510b30f1 upstream. The patch adds the flush in p9_mux_poll_stop() as it is the function used by p9_conn_destroy(), in turn called by p9_fd_close() to stop the async polling associated with the data regarding the connection. 
Link: http://lkml.kernel.org/r/20180720092730.27104-1-tomasbortoli@gmail.com Signed-off-by: Tomas Bortoli Reported-by: syzbot+39749ed7d9ef6dfb23f6@syzkaller.appspotmail.com To: Eric Van Hensbergen To: Ron Minnich To: Latchesar Ionkov Cc: Yiwen Jiang Cc: stable@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/trans_fd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bced8c074c12..c923221bb8b9 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -185,6 +185,8 @@ static void p9_mux_poll_stop(struct p9_conn *m) spin_lock_irqsave(&p9_poll_lock, flags); list_del_init(&m->poll_pending_link); spin_unlock_irqrestore(&p9_poll_lock, flags); + + flush_work(&p9_poll_work); } /** From 15898df477269c981dc1ae5afa39e1bb65e1db0a Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 7 Sep 2018 11:13:07 +0200 Subject: [PATCH 0791/1212] x86/mm/pat: Fix L1TF stable backport for CPA, 2nd call Mostly recycling the commit log from adaba23ccd7d which fixed populate_pmd, but did not fix populate_pud. The same problem exists there. Stable trees reverted the following patch: Revert "x86/mm/pat: Ensure cpa->pfn only contains page frame numbers" This reverts commit 87e2bd898d3a79a8c609f183180adac47879a2a4 which is commit edc3b9129cecd0f0857112136f5b8b1bc1d45918 upstream. but the L1TF patch 02ff2769edbc backported here x86/mm/pat: Make set_memory_np() L1TF safe commit 958f79b9ee55dfaf00c8106ed1c22a2919e0028b upstream set_memory_np() is used to mark kernel mappings not present, but it has it's own open coded mechanism which does not have the L1TF protection of inverting the address bits. assumed that cpa->pfn contains a PFN. With the above patch reverted it does not, which causes the PUD to be set to an incorrect address shifted by 12 bits, which can cause various failures. Convert the address to a PFN before passing it to pud_pfn(). 
This is a 4.4 stable only patch to fix the L1TF patches backport there. Cc: stable@vger.kernel.org # 4.4-only Cc: Andi Kleen Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1007fa80f5a6..0e1dd7d47f05 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1079,7 +1079,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, * Map everything starting from the Gb boundary, possibly with 1G pages */ while (end - start >= PUD_SIZE) { - set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn >> PAGE_SHIFT, canon_pgprot(pud_pgprot)))); start += PUD_SIZE; From 75ae059e856946a47f600c9ee1cd60dba006c6d3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 2 Aug 2018 16:08:52 -0400 Subject: [PATCH 0792/1212] dm cache metadata: save in-core policy_hint_size to on-disk superblock commit fd2fa95416188a767a63979296fa3e169a9ef5ec upstream. policy_hint_size starts as 0 during __write_initial_superblock(). It isn't until the policy is loaded that policy_hint_size is set in-core (cmd->policy_hint_size). But it never got recorded in the on-disk superblock because __commit_transaction() didn't deal with transferring the in-core cmd->policy_hint_size to the on-disk superblock. The in-core cmd->policy_hint_size gets initialized by metadata_open()'s __begin_transaction_flags() which re-reads all superblock fields. Because the superblock's policy_hint_size was never properly stored, when the cache was created, hints_array_available() would always return false when re-activating a previously created cache. This means __load_mappings() always considered the hints invalid and never made use of the hints (these hints served to optimize). 
Another detrimental side-effect of this oversight is the cache_check utility would fail with: "invalid hint width: 0" Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-cache-metadata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index d3c55d7754af..905badc6cb17 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -337,7 +337,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); - disk_super->policy_hint_size = 0; + disk_super->policy_hint_size = cpu_to_le32(0); __copy_sm_root(cmd, disk_super); @@ -652,6 +652,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); + disk_super->policy_hint_size = cpu_to_le32(cmd->policy_hint_size); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); From ac14c5d1a5d14df3d5dffdde9fb1ec42abf38ed8 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 25 Jun 2018 11:03:07 +0300 Subject: [PATCH 0793/1212] iio: ad9523: Fix displayed phase commit 5a4e33c1c53ae7d4425f7d94e60e4458a37b349e upstream. Fix the displayed phase for the ad9523 driver. Currently the most significant decimal place is dropped and all other digits are shifted one to the left. This is due to a multiplication by 10, which is not necessary, so remove it. 
Signed-off-by: Lars-Peter Clausen Signed-off-by: Alexandru Ardelean Fixes: cd1678f9632 ("iio: frequency: New driver for AD9523 SPI Low Jitter Clock Generator") Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/frequency/ad9523.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/frequency/ad9523.c b/drivers/iio/frequency/ad9523.c index 44a30f286de1..adc86aa30409 100644 --- a/drivers/iio/frequency/ad9523.c +++ b/drivers/iio/frequency/ad9523.c @@ -641,7 +641,7 @@ static int ad9523_read_raw(struct iio_dev *indio_dev, code = (AD9523_CLK_DIST_DIV_PHASE_REV(ret) * 3141592) / AD9523_CLK_DIST_DIV_REV(ret); *val = code / 1000000; - *val2 = (code % 1000000) * 10; + *val2 = code % 1000000; return IIO_VAL_INT_PLUS_MICRO; default: return -EINVAL; From 0785d7aedf80d0d36f6ec259477369125a491edb Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Fri, 27 Jul 2018 09:42:45 +0300 Subject: [PATCH 0794/1212] iio: ad9523: Fix return value for ad952x_store() commit 9a5094ca29ea9b1da301b31fd377c0c0c4c23034 upstream. A sysfs write callback function needs to either return the number of consumed characters or an error. The ad952x_store() function currently returns 0 if the input value was "0", this will signal that no characters have been consumed and the function will be called repeatedly in a loop indefinitely. Fix this by returning number of supplied characters to indicate that the whole input string has been consumed. 
Signed-off-by: Lars-Peter Clausen Signed-off-by: Alexandru Ardelean Fixes: cd1678f96329 ("iio: frequency: New driver for AD9523 SPI Low Jitter Clock Generator") Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/frequency/ad9523.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/frequency/ad9523.c b/drivers/iio/frequency/ad9523.c index adc86aa30409..57b1812a5a18 100644 --- a/drivers/iio/frequency/ad9523.c +++ b/drivers/iio/frequency/ad9523.c @@ -507,7 +507,7 @@ static ssize_t ad9523_store(struct device *dev, return ret; if (!state) - return 0; + return len; mutex_lock(&indio_dev->mlock); switch ((u32)this_attr->address) { From 244ce5c9b32a62626367f5159d1557c815029da4 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 19 Jun 2018 16:00:24 -0700 Subject: [PATCH 0795/1212] vmw_balloon: fix inflation of 64-bit GFNs commit 09755690c6b7c1eabdc4651eb3b276f8feb1e447 upstream. When balloon batching is not supported by the hypervisor, the guest frame number (GFN) must fit in 32-bit. However, due to a bug, this check was mistakenly ignored. In practice, when total RAM is greater than 16TB, the balloon does not work currently, making this bug unlikely to happen. 
Fixes: ef0f8f112984 ("VMware balloon: partially inline vmballoon_reserve_page.") Cc: stable@vger.kernel.org Reviewed-by: Xavier Deguillard Signed-off-by: Nadav Amit Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_balloon.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 5e047bfc0cc4..b0b6f99c8f0f 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -450,7 +450,7 @@ static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn, pfn32 = (u32)pfn; if (pfn32 != pfn) - return -1; + return -EINVAL; STATS_INC(b->stats.lock[false]); @@ -460,7 +460,7 @@ static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn, pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status); STATS_INC(b->stats.lock_fail[false]); - return 1; + return -EIO; } static int vmballoon_send_batched_lock(struct vmballoon *b, @@ -597,11 +597,12 @@ static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages, locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status, target); - if (locked > 0) { + if (locked) { STATS_INC(b->stats.refused_alloc[false]); - if (hv_status == VMW_BALLOON_ERROR_RESET || - hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED) { + if (locked == -EIO && + (hv_status == VMW_BALLOON_ERROR_RESET || + hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED)) { vmballoon_free_page(page, false); return -EIO; } @@ -617,7 +618,7 @@ static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages, } else { vmballoon_free_page(page, false); } - return -EIO; + return locked; } /* track allocated page */ From 059766538c16541bf56764869757a29b85840312 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 19 Jun 2018 16:00:25 -0700 Subject: [PATCH 0796/1212] vmw_balloon: do not use 2MB without batching commit 5081efd112560d3febb328e627176235b250d59d upstream. 
If the hypervisor sets 2MB batching is on, while batching is cleared, the balloon code breaks. In this case the legacy mechanism is used with 2MB page. The VM would report a 2MB page is ballooned, and the hypervisor would only take the first 4KB. While the hypervisor should not report such settings, make the code more robust by not enabling 2MB support without batching. Fixes: 365bd7ef7ec8e ("VMware balloon: Support 2m page ballooning.") Cc: stable@vger.kernel.org Reviewed-by: Xavier Deguillard Signed-off-by: Nadav Amit Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_balloon.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index b0b6f99c8f0f..b6ccd551c00e 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -341,7 +341,13 @@ static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps) success = false; } - if (b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) + /* + * 2MB pages are only supported with batching. If batching is for some + * reason disabled, do not use 2MB pages, since otherwise the legacy + * mechanism is used with 2MB pages, causing a failure. + */ + if ((b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) && + (b->capabilities & VMW_BALLOON_BATCHED_CMDS)) b->supported_page_sizes = 2; else b->supported_page_sizes = 1; From cae45e44dc46f3f793b974307955c5fe3eac0170 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 19 Jun 2018 16:00:26 -0700 Subject: [PATCH 0797/1212] vmw_balloon: VMCI_DOORBELL_SET does not check status commit ce664331b2487a5d244a51cbdd8cb54f866fbe5d upstream. When vmballoon_vmci_init() sets a doorbell using VMCI_DOORBELL_SET, for some reason it does not consider the status and looks at the result. However, the hypervisor does not update the result - it updates the status. This might cause VMCI doorbell not to be enabled, resulting in degraded performance. 
Fixes: 48e3d668b790 ("VMware balloon: Enable notification via VMCI") Cc: stable@vger.kernel.org Reviewed-by: Xavier Deguillard Signed-off-by: Nadav Amit Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_balloon.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index b6ccd551c00e..8e739b29079e 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1036,29 +1036,30 @@ static void vmballoon_vmci_cleanup(struct vmballoon *b) */ static int vmballoon_vmci_init(struct vmballoon *b) { - int error = 0; + unsigned long error, dummy; - if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) != 0) { - error = vmci_doorbell_create(&b->vmci_doorbell, - VMCI_FLAG_DELAYED_CB, - VMCI_PRIVILEGE_FLAG_RESTRICTED, - vmballoon_doorbell, b); + if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) == 0) + return 0; - if (error == VMCI_SUCCESS) { - VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, - b->vmci_doorbell.context, - b->vmci_doorbell.resource, error); - STATS_INC(b->stats.doorbell_set); - } - } + error = vmci_doorbell_create(&b->vmci_doorbell, VMCI_FLAG_DELAYED_CB, + VMCI_PRIVILEGE_FLAG_RESTRICTED, + vmballoon_doorbell, b); - if (error != 0) { - vmballoon_vmci_cleanup(b); + if (error != VMCI_SUCCESS) + goto fail; - return -EIO; - } + error = VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, b->vmci_doorbell.context, + b->vmci_doorbell.resource, dummy); + + STATS_INC(b->stats.doorbell_set); + + if (error != VMW_BALLOON_SUCCESS) + goto fail; return 0; +fail: + vmballoon_vmci_cleanup(b); + return -EIO; } /* From 3328bbe314e6d46aff6c074441a7a500209ca345 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 19 Jun 2018 16:00:27 -0700 Subject: [PATCH 0798/1212] vmw_balloon: fix VMCI use when balloon built into kernel commit c3cc1b0fc27508da53fe955a3b23d03964410682 upstream. 
Currently, when all modules, including VMCI and VMware balloon are built into the kernel, the initialization of the balloon happens before the VMCI is probed. As a result, the balloon fails to initialize the VMCI doorbell, which it uses to get asynchronous requests for balloon size changes. The problem can be seen in the logs, in the form of the following message: "vmw_balloon: failed to initialize vmci doorbell" The driver would work correctly but slightly less efficiently, probing for requests periodically. This patch changes the balloon to be initialized using late_initcall() instead of module_init() to address this issue. It does not address a situation in which VMCI is built as a module and the balloon is built into the kernel. Fixes: 48e3d668b790 ("VMware balloon: Enable notification via VMCI") Cc: stable@vger.kernel.org Reviewed-by: Xavier Deguillard Signed-off-by: Nadav Amit Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_balloon.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 8e739b29079e..518e2dec2aa2 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1297,7 +1297,14 @@ static int __init vmballoon_init(void) return 0; } -module_init(vmballoon_init); + +/* + * Using late_initcall() instead of module_init() allows the balloon to use the + * VMCI doorbell even when the balloon is built into the kernel. Otherwise the + * VMCI is probed only after the balloon is initialized. If the balloon is used + * as a module, late_initcall() is equivalent to module_init(). + */ +late_initcall(vmballoon_init); static void __exit vmballoon_exit(void) { From 0943ce7b7e066a88210ca3ea53db5515d21b3312 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 15:40:57 -0400 Subject: [PATCH 0799/1212] tracing: Do not call start/stop() functions when tracing_on does not change commit f143641bfef9a4a60c57af30de26c63057e7e695 upstream. 
Currently, when one echoes 1 into tracing_on, the current tracer's "start()" function is executed, even if tracing_on was already one. This can lead to strange side effects. One being that if the hwlat tracer is enabled, and someone does "echo 1 > tracing_on", the hwlat tracer's start() function is called again, which will create another kernel thread and make it impossible to remove the old one. Link: http://lkml.kernel.org/r/1533120354-22923-1-git-send-email-erica.bugden@linutronix.de Cc: stable@vger.kernel.org Fixes: 2df8f8a6a897e ("tracing: Fix regression with irqsoff tracer and tracing_on file") Reported-by: Erica Bugden Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 11761b3dd7ba..e409ddce8754 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6496,7 +6496,9 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (buffer) { mutex_lock(&trace_types_lock); - if (val) { + if (!!val == tracer_tracing_is_on(tr)) { + val = 0; /* do nothing */ + } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); From 34324394f9fa3e92e355aba40ac1b0b1d1d8d3c3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Aug 2018 16:08:37 -0400 Subject: [PATCH 0800/1212] tracing/blktrace: Fix to allow setting same value commit 757d9140072054528b13bbe291583d9823cde195 upstream. Masami Hiramatsu reported: Current trace-enable attribute in sysfs returns an error if user writes the same setting value as current one, e.g.
# cat /sys/block/sda/trace/enable 0 # echo 0 > /sys/block/sda/trace/enable bash: echo: write error: Invalid argument # echo 1 > /sys/block/sda/trace/enable # echo 1 > /sys/block/sda/trace/enable bash: echo: write error: Device or resource busy But this is not a preferred behavior, it should ignore if new setting is same as current one. This fixes the problem as below. # cat /sys/block/sda/trace/enable 0 # echo 0 > /sys/block/sda/trace/enable # echo 1 > /sys/block/sda/trace/enable # echo 1 > /sys/block/sda/trace/enable Link: http://lkml.kernel.org/r/20180816103802.08678002@gandalf.local.home Cc: Ingo Molnar Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: stable@vger.kernel.org Fixes: cd649b8bb830d ("blktrace: remove sysfs_blk_trace_enable_show/store()") Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- kernel/trace/blktrace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7ab5eafea8b2..210b8e726a97 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1716,6 +1716,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&bdev->bd_mutex); if (attr == &dev_attr_enable) { + if (!!value == !!q->blk_trace) { + ret = 0; + goto out_unlock_bdev; + } if (value) ret = blk_trace_setup_queue(q, bdev); else From f6db350c9af9814d200188e94ae8682ca995ec84 Mon Sep 17 00:00:00 2001 From: Snild Dolkow Date: Thu, 26 Jul 2018 09:15:39 +0200 Subject: [PATCH 0801/1212] kthread, tracing: Don't expose half-written comm when creating kthreads commit 3e536e222f2930534c252c1cc7ae799c725c5ff9 upstream. There is a window for racing when printing directly to task->comm, allowing other threads to see a non-terminated string. The vsnprintf function fills the buffer, counts the truncated chars, then finally writes the \0 at the end. 
creator other vsnprintf: fill (not terminated) count the rest trace_sched_waking(p): ... memcpy(comm, p->comm, TASK_COMM_LEN) write \0 The consequences depend on how 'other' uses the string. In our case, it was copied into the tracing system's saved cmdlines, a buffer of adjacent TASK_COMM_LEN-byte buffers (note the 'n' where 0 should be): crash-arm64> x/1024s savedcmd->saved_cmdlines | grep 'evenk' 0xffffffd5b3818640: "irq/497-pwr_evenkworker/u16:12" ...and a strcpy out of there would cause stack corruption: [224761.522292] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffff9bf9783c78 crash-arm64> kbt | grep 'comm\|trace_print_context' #6 0xffffff9bf9783c78 in trace_print_context+0x18c(+396) comm (char [16]) = "irq/497-pwr_even" crash-arm64> rd 0xffffffd4d0e17d14 8 ffffffd4d0e17d14: 2f71726900000000 5f7277702d373934 ....irq/497-pwr_ ffffffd4d0e17d24: 726f776b6e657665 3a3631752f72656b evenkworker/u16: ffffffd4d0e17d34: f9780248ff003231 cede60e0ffffff9b 12..H.x......`.. ffffffd4d0e17d44: cede60c8ffffffd4 00000fffffffffd4 .....`.......... The workaround in e09e28671 (use strlcpy in __trace_find_cmdline) was likely needed because of this same bug. Solved by vsnprintf:ing to a local buffer, then using set_task_comm(). This way, there won't be a window where comm is not terminated. 
Link: http://lkml.kernel.org/r/20180726071539.188015-1-snild@sony.com Cc: stable@vger.kernel.org Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure") Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Snild Dolkow Signed-off-by: Steven Rostedt (VMware) [backported to 3.18 / 4.4 by Snild] Signed-off-by: Greg Kroah-Hartman --- kernel/kthread.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 850b255649a2..ac6849ee3057 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -313,10 +313,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; va_list args; va_start(args, namefmt); - vsnprintf(task->comm, sizeof(task->comm), namefmt, args); + /* + * task is already visible to other tasks, so updating + * COMM must be protected. + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); va_end(args); /* * root may have changed our (kthreadd's) priority or CPU mask. From 6977074c573e63619dbef40ab36d75fe5713b714 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Aug 2018 15:37:59 -0400 Subject: [PATCH 0802/1212] uprobes: Use synchronize_rcu() not synchronize_sched() commit 016f8ffc48cb01d1e7701649c728c5d2e737d295 upstream. While debugging another bug, I was looking at all the synchronize*() functions being used in kernel/trace, and noticed that trace_uprobes was using synchronize_sched(), with a comment to synchronize with {u,ret}_probe_trace_func(). When looking at those functions, the data is protected with "rcu_read_lock()" and not with "rcu_read_lock_sched()". This is using the wrong synchronize_*() function. 
Link: http://lkml.kernel.org/r/20180809160553.469e1e32@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 70ed91c6ec7f8 ("tracing/uprobes: Support ftrace_event_file base multibuffer") Acked-by: Oleg Nesterov Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 68bb89ad9d28..1dc887bab085 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -969,7 +969,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) list_del_rcu(&link->list); /* synchronize with u{,ret}probe_trace_func */ - synchronize_sched(); + synchronize_rcu(); kfree(link); if (!list_empty(&tu->tp.files)) From 6f9c611f0765f4caff557deefc5a63c0f688891e Mon Sep 17 00:00:00 2001 From: Tomas Bortoli Date: Fri, 27 Jul 2018 13:05:58 +0200 Subject: [PATCH 0803/1212] 9p: fix multiple NULL-pointer-dereferences commit 10aa14527f458e9867cf3d2cc6b8cb0f6704448b upstream. Added checks to prevent GPFs from raising. 
Link: http://lkml.kernel.org/r/20180727110558.5479-1-tomasbortoli@gmail.com Signed-off-by: Tomas Bortoli Reported-by: syzbot+1a262da37d3bead15c39@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- net/9p/trans_fd.c | 5 ++++- net/9p/trans_rdma.c | 3 +++ net/9p/trans_virtio.c | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index c923221bb8b9..2f68ffda3715 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -935,7 +935,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) if (err < 0) return err; - if (valid_ipaddr4(addr) < 0) + if (addr == NULL || valid_ipaddr4(addr) < 0) return -EINVAL; csocket = NULL; @@ -983,6 +983,9 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) csocket = NULL; + if (addr == NULL) + return -EINVAL; + if (strlen(addr) >= UNIX_PATH_MAX) { pr_err("%s (%d): address too long: %s\n", __func__, task_pid_nr(current), addr); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 52b4a2f993f2..f42550dd3560 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -644,6 +644,9 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct ib_qp_init_attr qp_attr; struct ib_cq_init_attr cq_attr = {}; + if (addr == NULL) + return -EINVAL; + /* Parse the transport specific mount options */ err = parse_opts(args, &opts); if (err < 0) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 669198ac73db..6018a1c0dc28 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -654,6 +654,9 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) int ret = -ENOENT; int found = 0; + if (devname == NULL) + return -EINVAL; + mutex_lock(&virtio_9p_lock); list_for_each_entry(chan, &virtio_chan_list, chan_list) { if (!strncmp(devname, chan->tag, chan->tag_len) && From bd3a83160c0d9ef4c0901ebd14ed77bdab93df2e Mon 
Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Tue, 14 Aug 2018 10:34:42 +0800 Subject: [PATCH 0804/1212] PM / sleep: wakeup: Fix build error caused by missing SRCU support commit 3df6f61fff49632492490fb6e42646b803a9958a upstream. Commit ea0212f40c6 (power: auto select CONFIG_SRCU) made the code in drivers/base/power/wakeup.c use SRCU instead of RCU, but it forgot to select CONFIG_SRCU in Kconfig, which leads to the following build error if CONFIG_SRCU is not selected somewhere else: drivers/built-in.o: In function `wakeup_source_remove': (.text+0x3c6fc): undefined reference to `synchronize_srcu' drivers/built-in.o: In function `pm_print_active_wakeup_sources': (.text+0x3c7a8): undefined reference to `__srcu_read_lock' drivers/built-in.o: In function `pm_print_active_wakeup_sources': (.text+0x3c84c): undefined reference to `__srcu_read_unlock' drivers/built-in.o: In function `device_wakeup_arm_wake_irqs': (.text+0x3d1d8): undefined reference to `__srcu_read_lock' drivers/built-in.o: In function `device_wakeup_arm_wake_irqs': (.text+0x3d228): undefined reference to `__srcu_read_unlock' drivers/built-in.o: In function `device_wakeup_disarm_wake_irqs': (.text+0x3d24c): undefined reference to `__srcu_read_lock' drivers/built-in.o: In function `device_wakeup_disarm_wake_irqs': (.text+0x3d29c): undefined reference to `__srcu_read_unlock' drivers/built-in.o:(.data+0x4158): undefined reference to `process_srcu' Fix this error by selecting CONFIG_SRCU when PM_SLEEP is enabled. Fixes: ea0212f40c6 (power: auto select CONFIG_SRCU) Cc: 4.2+ # 4.2+ Signed-off-by: zhangyi (F) [ rjw: Minor subject/changelog fixups ] Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/power/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..9d76184279fe 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -105,6 +105,7 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM + select SRCU config PM_SLEEP_SMP def_bool y From f5fa2009e3a9159e99fe6b11d51796008883cdf5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Jul 2018 12:59:58 +0300 Subject: [PATCH 0805/1212] pnfs/blocklayout: off by one in bl_map_stripe() commit 0914bb965e38a055e9245637aed117efbe976e91 upstream. "dev->nr_children" is the number of children which were parsed successfully in bl_parse_stripe(). It could be all of them and then, in that case, it is equal to v->stripe.volumes_count. Either way, the > should be >= so that we don't go beyond the end of what we're supposed to. Fixes: 5c83746a0cf2 ("pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing") Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org # 3.17+ Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/blocklayout/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index a861bbdfe577..fa8b484d035d 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -162,7 +162,7 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, chunk = div_u64(offset, dev->chunk_size); div_u64_rem(chunk, dev->nr_children, &chunk_idx); - if (chunk_idx > dev->nr_children) { + if (chunk_idx >= dev->nr_children) { dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", __func__, chunk_idx, offset, dev->chunk_size); /* error, should not happen */ From 2f04971a962949099881dfc7be23392ad14aa8c9 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 3 Jul 2018 09:59:47 +0100 Subject: [PATCH 0806/1212] ARM: tegra: Fix Tegra30 Cardhu PCA954x 
reset commit 6e1811900b6fe6f2b4665dba6bd6ed32c6b98575 upstream. On all versions of Tegra30 Cardhu, the reset signal to the NXP PCA9546 I2C mux is connected to the Tegra GPIO BB0. Currently, this pin on the Tegra is not configured as a GPIO but as a special-function IO (SFIO) that is multiplexing the pin to an I2S controller. On exiting system suspend, I2C commands sent to the PCA9546 are failing because there is no ACK. Although it is not possible to see exactly what is happening to the reset during suspend, by ensuring it is configured as a GPIO and driven high, to de-assert the reset, the failures are no longer seen. Please note that this GPIO is also used to drive the reset signal going to the camera connector on the board. However, given that there is no camera support currently for Cardhu, this should not have any impact. Fixes: 40431d16ff11 ("ARM: tegra: enable PCA9546 on Cardhu") Cc: stable@vger.kernel.org Signed-off-by: Jon Hunter Signed-off-by: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/tegra30-cardhu.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/tegra30-cardhu.dtsi b/arch/arm/boot/dts/tegra30-cardhu.dtsi index bb1ca158273c..1922e7a93e40 100644 --- a/arch/arm/boot/dts/tegra30-cardhu.dtsi +++ b/arch/arm/boot/dts/tegra30-cardhu.dtsi @@ -201,6 +201,7 @@ i2cmux@70 { #address-cells = <1>; #size-cells = <0>; reg = <0x70>; + reset-gpio = <&gpio TEGRA_GPIO(BB, 0) GPIO_ACTIVE_LOW>; }; }; From 70201a4e368833c15625d8dc32fd9c0286a12b58 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Aug 2018 17:30:14 +0200 Subject: [PATCH 0807/1212] mm/tlb: Remove tlb_remove_table() non-concurrent condition commit a6f572084fbee8b30f91465f4a085d7a90901c57 upstream. Will noted that only checking mm_users is incorrect; we should also check mm_count in order to cover CPUs that have a lazy reference to this mm (and could do speculative TLB operations). 
If removing this turns out to be a performance issue, we can re-instate a more complete check, but in tlb_table_flush() eliding the call_rcu_sched(). Fixes: 267239116987 ("mm, powerpc: move the RCU page-table freeing into generic code") Reported-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Will Deacon Cc: Nicholas Piggin Cc: David Miller Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: stable@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memory.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 42db644f5ec4..5aee9ec8b8c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -361,15 +361,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; - /* - * When there's less then two users of this mm there cannot be a - * concurrent page-table walk. - */ - if (atomic_read(&tlb->mm->mm_users) < 2) { - __tlb_remove_table(table); - return; - } - if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { From d25b6212cc955482eefef191b02975c1fb87d65c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 7 Jun 2018 09:56:59 -0700 Subject: [PATCH 0808/1212] iommu/vt-d: Add definitions for PFSID commit 0f725561e168485eff7277d683405c05b192f537 upstream. When SRIOV VF device IOTLB is invalidated, we need to provide the PF source ID such that IOMMU hardware can gauge the depth of invalidation queue which is shared among VFs. This is needed when device invalidation throttle (DIT) capability is supported. This patch adds bit definitions for checking and tracking PFSID. 
Signed-off-by: Jacob Pan Cc: stable@vger.kernel.org Cc: "Ashok Raj" Cc: "Lu Baolu" Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/intel-iommu.c | 1 + include/linux/intel-iommu.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 4efec2db4ee2..652548ba6dcf 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -419,6 +419,7 @@ struct device_domain_info { struct list_head global; /* link to global list */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ + u16 pfsid; /* SRIOV physical function source ID */ u8 pasid_supported:3; u8 pasid_enabled:1; u8 pri_supported:1; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 23e129ef6726..0892615ce93d 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -125,6 +125,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) * Extended Capability Register */ +#define ecap_dit(e) ((e >> 41) & 0x1) #define ecap_pasid(e) ((e >> 40) & 0x1) #define ecap_pss(e) ((e >> 35) & 0x1f) #define ecap_eafs(e) ((e >> 34) & 0x1) @@ -294,6 +295,7 @@ enum { #define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) #define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) #define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xfff) << 52)) #define QI_DEV_IOTLB_SIZE 1 #define QI_DEV_IOTLB_MAX_INVS 32 @@ -318,6 +320,7 @@ enum { #define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32) #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xfff) << 52)) #define QI_DEV_EIOTLB_MAX_INVS 32 #define QI_PGRP_IDX(idx) (((u64)(idx)) << 55) From d792799caa81f9b0a850380a9eacafa4922b3990 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 7 Jun 2018 
09:57:00 -0700 Subject: [PATCH 0809/1212] iommu/vt-d: Fix dev iotlb pfsid use commit 1c48db44924298ad0cb5a6386b88017539be8822 upstream. PFSID should be used in the invalidation descriptor for flushing device IOTLBs on SRIOV VFs. Signed-off-by: Jacob Pan Cc: stable@vger.kernel.org Cc: "Ashok Raj" Cc: "Lu Baolu" Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/dmar.c | 6 +++--- drivers/iommu/intel-iommu.c | 17 ++++++++++++++++- include/linux/intel-iommu.h | 5 ++--- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index e913a930ac80..5a63e32a4a6b 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1315,8 +1315,8 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, qi_submit_sync(&desc, iommu); } -void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, - u64 addr, unsigned mask) +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned mask) { struct qi_desc desc; @@ -1331,7 +1331,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, qdep = 0; desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | - QI_DIOTLB_TYPE; + QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); qi_submit_sync(&desc, iommu); } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 652548ba6dcf..49b266433f4c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1480,6 +1480,20 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) return; pdev = to_pci_dev(info->dev); + /* For IOMMU that supports device IOTLB throttling (DIT), we assign + * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge + * queue depth at PF level. If DIT is not set, PFSID will be treated as + * reserved, which should be set to 0. 
+ */ + if (!ecap_dit(info->iommu->ecap)) + info->pfsid = 0; + else { + struct pci_dev *pf_pdev; + + /* pdev will be returned if device is not a vf */ + pf_pdev = pci_physfn(pdev); + info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn); + } #ifdef CONFIG_INTEL_IOMMU_SVM /* The PCIe spec, in its wisdom, declares that the behaviour of @@ -1538,7 +1552,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, sid = info->bus << 8 | info->devfn; qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, addr, mask); } spin_unlock_irqrestore(&device_domain_lock, flags); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0892615ce93d..e353f6600b0b 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -466,9 +466,8 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); -extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, - u64 addr, unsigned mask); - +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned mask); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); From 1a9bc340b8d3f450bfda0b94465823548b1c677c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 May 2017 21:39:49 -0400 Subject: [PATCH 0810/1212] osf_getdomainname(): use copy_to_user() commit 9ba3eb5103cf56f0daaf07de4507df76e7813ed7 upstream. 
Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- arch/alpha/kernel/osf_sys.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 63f06a2b1f7f..e0903215982d 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -561,25 +561,20 @@ SYSCALL_DEFINE0(getdtablesize) */ SYSCALL_DEFINE2(osf_getdomainname, char __user *, name, int, namelen) { - unsigned len; - int i; + int len, err = 0; + char *kname; - if (!access_ok(VERIFY_WRITE, name, namelen)) - return -EFAULT; - - len = namelen; - if (len > 32) - len = 32; + if (namelen > 32) + namelen = 32; down_read(&uts_sem); - for (i = 0; i < len; ++i) { - __put_user(utsname()->domainname[i], name + i); - if (utsname()->domainname[i] == '\0') - break; - } + kname = utsname()->domainname; + len = strnlen(kname, namelen); + if (copy_to_user(name, kname, min(len + 1, namelen))) + err = -EFAULT; up_read(&uts_sem); - return 0; + return err; } /* From 5c16a16fcf03789baddd43fe4ca734b4c2877db3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 25 Jun 2018 18:34:10 +0200 Subject: [PATCH 0811/1212] sys: don't hold uts_sem while accessing userspace memory commit 42a0cc3478584d4d63f68f2f5af021ddbea771fa upstream. Holding uts_sem as a writer while accessing userspace memory allows a namespace admin to stall all processes that attempt to take uts_sem. Instead, move data through stack buffers and don't access userspace memory while uts_sem is held. Cc: stable@vger.kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jann Horn Signed-off-by: Eric W. 
Biederman Signed-off-by: Greg Kroah-Hartman --- arch/alpha/kernel/osf_sys.c | 51 ++++++++--------- arch/sparc/kernel/sys_sparc_32.c | 22 +++++--- arch/sparc/kernel/sys_sparc_64.c | 20 ++++--- kernel/sys.c | 95 +++++++++++++++----------------- kernel/utsname_sysctl.c | 45 +++++++++------ 5 files changed, 121 insertions(+), 112 deletions(-) diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index e0903215982d..bbc7cb9faa01 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -526,24 +526,19 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path, SYSCALL_DEFINE1(osf_utsname, char __user *, name) { int error; + char tmp[5 * 32]; down_read(&uts_sem); - error = -EFAULT; - if (copy_to_user(name + 0, utsname()->sysname, 32)) - goto out; - if (copy_to_user(name + 32, utsname()->nodename, 32)) - goto out; - if (copy_to_user(name + 64, utsname()->release, 32)) - goto out; - if (copy_to_user(name + 96, utsname()->version, 32)) - goto out; - if (copy_to_user(name + 128, utsname()->machine, 32)) - goto out; + memcpy(tmp + 0 * 32, utsname()->sysname, 32); + memcpy(tmp + 1 * 32, utsname()->nodename, 32); + memcpy(tmp + 2 * 32, utsname()->release, 32); + memcpy(tmp + 3 * 32, utsname()->version, 32); + memcpy(tmp + 4 * 32, utsname()->machine, 32); + up_read(&uts_sem); - error = 0; - out: - up_read(&uts_sem); - return error; + if (copy_to_user(name, tmp, sizeof(tmp))) + return -EFAULT; + return 0; } SYSCALL_DEFINE0(getpagesize) @@ -563,18 +558,21 @@ SYSCALL_DEFINE2(osf_getdomainname, char __user *, name, int, namelen) { int len, err = 0; char *kname; + char tmp[32]; - if (namelen > 32) + if (namelen < 0 || namelen > 32) namelen = 32; down_read(&uts_sem); kname = utsname()->domainname; len = strnlen(kname, namelen); - if (copy_to_user(name, kname, min(len + 1, namelen))) - err = -EFAULT; + len = min(len + 1, namelen); + memcpy(tmp, kname, len); up_read(&uts_sem); - return err; + if (copy_to_user(name, tmp, len)) + 
return -EFAULT; + return 0; } /* @@ -736,13 +734,14 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count) }; unsigned long offset; const char *res; - long len, err = -EINVAL; + long len; + char tmp[__NEW_UTS_LEN + 1]; offset = command-1; if (offset >= ARRAY_SIZE(sysinfo_table)) { /* Digital UNIX has a few unpublished interfaces here */ printk("sysinfo(%d)", command); - goto out; + return -EINVAL; } down_read(&uts_sem); @@ -750,13 +749,11 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count) len = strlen(res)+1; if ((unsigned long)len > (unsigned long)count) len = count; - if (copy_to_user(buf, res, len)) - err = -EFAULT; - else - err = 0; + memcpy(tmp, res, len); up_read(&uts_sem); - out: - return err; + if (copy_to_user(buf, tmp, len)) + return -EFAULT; + return 0; } SYSCALL_DEFINE5(osf_getsysinfo, unsigned long, op, void __user *, buffer, diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 646988d4c1a3..740f43b9b541 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -201,23 +201,27 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, asmlinkage long sys_getdomainname(char __user *name, int len) { - int nlen, err; - + int nlen, err; + char tmp[__NEW_UTS_LEN + 1]; + if (len < 0) return -EINVAL; - down_read(&uts_sem); - + down_read(&uts_sem); + nlen = strlen(utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) - goto out; + goto out_unlock; + memcpy(tmp, utsname()->domainname, nlen); - err = -EFAULT; - if (!copy_to_user(name, utsname()->domainname, nlen)) - err = 0; + up_read(&uts_sem); -out: + if (copy_to_user(name, tmp, nlen)) + return -EFAULT; + return 0; + +out_unlock: up_read(&uts_sem); return err; } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 7f0f7c01b297..f63cd2ea8470 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -524,23 +524,27 @@ extern void 
check_pending(int signum); SYSCALL_DEFINE2(getdomainname, char __user *, name, int, len) { - int nlen, err; + int nlen, err; + char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; - down_read(&uts_sem); - + down_read(&uts_sem); + nlen = strlen(utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) - goto out; + goto out_unlock; + memcpy(tmp, utsname()->domainname, nlen); - err = -EFAULT; - if (!copy_to_user(name, utsname()->domainname, nlen)) - err = 0; + up_read(&uts_sem); -out: + if (copy_to_user(name, tmp, nlen)) + return -EFAULT; + return 0; + +out_unlock: up_read(&uts_sem); return err; } diff --git a/kernel/sys.c b/kernel/sys.c index f718742e55e6..e2446ade79ba 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1142,18 +1142,19 @@ static int override_release(char __user *release, size_t len) SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { - int errno = 0; + struct new_utsname tmp; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!errno && override_release(name->release, sizeof(name->release))) - errno = -EFAULT; - if (!errno && override_architecture(name)) - errno = -EFAULT; - return errno; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME @@ -1162,55 +1163,46 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { - int error = 0; + struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof(*name))) - error = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = 
-EFAULT; - if (!error && override_architecture(name)) - error = -EFAULT; - return error; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { - int error; + struct oldold_utsname tmp = {}; if (!name) return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); + memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); + memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); + memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); + memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); + memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_architecture(name)) - error = -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - return error ? 
-EFAULT : 0; + if (override_architecture(name)) + return -EFAULT; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + return 0; } #endif @@ -1224,17 +1216,18 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } @@ -1242,8 +1235,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { - int i, errno; + int i; struct new_utsname *u; + char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; @@ -1252,11 +1246,11 @@ SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) i = 1 + strlen(u->nodename); if (i > len) i = len; - errno = 0; - if (copy_to_user(name, u->nodename, i)) - errno = -EFAULT; + memcpy(tmp, u->nodename, i); up_read(&uts_sem); - return errno; + if (copy_to_user(name, tmp, i)) + return -EFAULT; + return 0; } #endif @@ -1275,17 +1269,18 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index c8eac43267e9..d2b3b2973456 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@ #ifdef 
CONFIG_PROC_SYSCTL -static void *get_uts(struct ctl_table *table, int write) +static void *get_uts(struct ctl_table *table) { char *which = table->data; struct uts_namespace *uts_ns; @@ -25,21 +25,9 @@ static void *get_uts(struct ctl_table *table, int write) uts_ns = current->nsproxy->uts_ns; which = (which - (char *)&init_uts_ns) + (char *)uts_ns; - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); return which; } -static void put_uts(struct ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? @@ -49,13 +37,34 @@ static int proc_do_uts_string(struct ctl_table *table, int write, { struct ctl_table uts_table; int r; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table, write, buffer, lenp, ppos); - put_uts(table, write, uts_table.data); + char tmp_data[__NEW_UTS_LEN + 1]; - if (write) + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = tmp_data; + + /* + * Buffer the value in tmp_data so that proc_dostring() can be called + * without holding any locks. + * We also need to read the original value in the write==1 case to + * support partial writes. + */ + down_read(&uts_sem); + memcpy(tmp_data, get_uts(table), sizeof(tmp_data)); + up_read(&uts_sem); + r = proc_dostring(&uts_table, write, buffer, lenp, ppos); + + if (write) { + /* + * Write back the new value. + * Note that, since we dropped uts_sem, the result can + * theoretically be incorrect if there are two parallel writes + * at non-zero offsets to the same sysctl. 
+ */ + down_write(&uts_sem); + memcpy(get_uts(table), tmp_data, sizeof(tmp_data)); + up_write(&uts_sem); proc_sys_poll_notify(table->poll); + } return r; } From 3d4c43c8f0fb00ad5f58c06b382dc2dc769a63e1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 25 Jun 2018 18:34:19 +0200 Subject: [PATCH 0812/1212] userns: move user access out of the mutex commit 5820f140edef111a9ea2ef414ab2428b8cb805b1 upstream. The old code would hold the userns_state_mutex indefinitely if memdup_user_nul stalled due to e.g. a userfault region. Prevent that by moving the memdup_user_nul in front of the mutex_lock(). Note: This changes the error precedence of invalid buf/count/*ppos vs map already written / capabilities missing. Fixes: 22d917d80e84 ("userns: Rework the user_namespace adding uid/gid...") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Acked-by: Christian Brauner Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 88fefa68c516..a965df4b54f5 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -602,9 +602,26 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent *extent = NULL; - unsigned long page = 0; + unsigned long page; char *kbuf, *pos, *next_line; - ssize_t ret = -EINVAL; + ssize_t ret; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Get a buffer */ + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!page) + return -ENOMEM; + + /* Slurp in the user data */ + if (copy_from_user(kbuf, buf, count)) { + free_page(page); + return -EFAULT; + } + kbuf[count] = '\0'; /* * The userns_state_mutex serializes all writes to any given map. 
@@ -638,24 +655,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Get a buffer */ - ret = -ENOMEM; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!page) - goto out; - - /* Only allow < page size writes at the beginning of the file */ - ret = -EINVAL; - if ((*ppos != 0) || (count >= PAGE_SIZE)) - goto out; - - /* Slurp in the user data */ - ret = -EFAULT; - if (copy_from_user(kbuf, buf, count)) - goto out; - kbuf[count] = '\0'; - /* Parse the user data */ ret = -EINVAL; pos = kbuf; From 20da15a755e8c68194ed777813277daf4931147e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 12 Jun 2018 20:49:45 +0200 Subject: [PATCH 0813/1212] ubifs: Fix memory leak in lprobs self-check commit eef19816ada3abd56d9f20c88794cc2fea83ebb2 upstream. Allocate the buffer after we return early. Otherwise memory is being leaked. Cc: Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/lprops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index a0011aa3a779..f43f162e36f4 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1091,10 +1091,6 @@ static int scan_check_cb(struct ubifs_info *c, } } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); - if (!buf) - return -ENOMEM; - /* * After an unclean unmount, empty and freeable LEBs * may contain garbage - do not scan them. 
@@ -1113,6 +1109,10 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) + return -ENOMEM; + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { ret = PTR_ERR(sleb); From d90fea0256ce702abdfcbc0c0ff888002a13599c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 1 Jul 2018 23:20:50 +0200 Subject: [PATCH 0814/1212] Revert "UBIFS: Fix potential integer overflow in allocation" commit 08acbdd6fd736b90f8d725da5a0de4de2dd6de62 upstream. This reverts commit 353748a359f1821ee934afc579cf04572406b420. It bypassed the linux-mtd review process and fixes the issue not as it should. Cc: Kees Cook Cc: Silvio Cesare Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 22dba8837a86..0b9da5b6e0f9 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1107,7 +1107,7 @@ static int recomp_data_node(const struct ubifs_info *c, int err, len, compr_type, out_len; out_len = le32_to_cpu(dn->size); - buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); + buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; From 2e052c5081e5f56b46ad28ce57b6ea721b4887b1 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 1 Jul 2018 23:20:51 +0200 Subject: [PATCH 0815/1212] ubifs: Check data node size before truncate commit 95a22d2084d72ea067d8323cc85677dba5d97cae upstream. Check whether the size is within bounds before using it. If the size is not correct, abort and dump the bad data node. 
Cc: Kees Cook Cc: Silvio Cesare Cc: stable@vger.kernel.org Fixes: 1e51764a3c2ac ("UBIFS: add new flash file system") Reported-by: Silvio Cesare Signed-off-by: Richard Weinberger Reviewed-by: Kees Cook Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/journal.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 0b9da5b6e0f9..9887c03a631b 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1186,7 +1186,16 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, else if (err) goto out_free; else { - if (le32_to_cpu(dn->size) <= dlen) + int dn_len = le32_to_cpu(dn->size); + + if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { + ubifs_err(c, "bad data node (block %u, inode %lu)", + blk, inode->i_ino); + ubifs_dump_node(c, dn); + goto out_free; + } + + if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { int compr_type = le16_to_cpu(dn->compr_type); From dfd7543a6734b982b007734826add041158618ef Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 12 Jun 2018 00:52:28 +0200 Subject: [PATCH 0816/1212] ubifs: Fix synced_i_size calculation for xattr inodes commit 59965593205fa4044850d35ee3557cf0b7edcd14 upstream. In ubifs_jnl_update() we sync parent and child inodes to the flash, in case of xattrs, the parent inode (AKA host inode) has a non-zero data_len. Therefore we need to adjust synced_i_size too. This issue was reported by ubifs self tests under an xattr-related workload.
UBIFS error (ubi0:0 pid 1896): dbg_check_synced_i_size: ui_size is 4, synced_i_size is 0, but inode is clean UBIFS error (ubi0:0 pid 1896): dbg_check_synced_i_size: i_ino 65, i_mode 0x81a4, i_size 4 Cc: Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/journal.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 9887c03a631b..539fa934ed93 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -661,6 +661,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, spin_lock(&ui->ui_lock); ui->synced_i_size = ui->ui_size; spin_unlock(&ui->ui_lock); + if (xent) { + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + } mark_inode_clean(c, ui); mark_inode_clean(c, host_ui); return 0; From 57d78f3e00b9877cdeea6b44a6f0c070e7b0d3a7 Mon Sep 17 00:00:00 2001 From: Vignesh R Date: Mon, 11 Jun 2018 11:39:56 +0530 Subject: [PATCH 0817/1212] pwm: tiehrpwm: Fix disabling of output of PWMs commit 38dabd91ff0bde33352ca3cc65ef515599b77a05 upstream. pwm-tiehrpwm driver disables PWM output by putting it in low output state via active AQCSFRC register in ehrpwm_pwm_disable(). But, the AQCSFRC shadow register is not updated. Therefore, when shadow AQCSFRC register is re-enabled in ehrpwm_pwm_enable() (say to enable second PWM output), previous settings are lost as shadow register value is loaded into active register. This results in things like PWMA getting enabled automatically, when PWMB is enabled and vice versa. Fix this by updating AQCSFRC shadow register as well during ehrpwm_pwm_disable(). 
Fixes: 19891b20e7c2 ("pwm: pwm-tiehrpwm: PWM driver support for EHRPWM") Cc: stable@vger.kernel.org Signed-off-by: Vignesh R Signed-off-by: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- drivers/pwm/pwm-tiehrpwm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 6a41e66015b6..062dff1c902d 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -384,6 +384,8 @@ static void ehrpwm_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) aqcsfrc_mask = AQCSFRC_CSFA_MASK; } + /* Update shadow register first before modifying active register */ + ehrpwm_modify(pc->mmio_base, AQCSFRC, aqcsfrc_mask, aqcsfrc_val); /* * Changes to immediate action on Action Qualifier. This puts * Action Qualifier control on PWM output from next TBCLK From c602af2b76af159cc3ad0828d247484f99b4945c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 25 Jul 2018 15:41:54 +0200 Subject: [PATCH 0818/1212] fb: fix lost console when the user unplugs a USB adapter commit 8c5b044299951acd91e830a688dd920477ea1eda upstream. I have a USB display adapter using the udlfb driver and I use it on an ARM board that doesn't have any graphics card. When I plug the adapter in, the console is properly displayed, however when I unplug and re-plug the adapter, the console is not displayed and I can't access it until I reboot the board. The reason is this: When the adapter is unplugged, dlfb_usb_disconnect calls unlink_framebuffer, then it waits until the reference count drops to zero and then it deallocates the framebuffer. However, the console that is attached to the framebuffer device keeps the reference count non-zero, so the framebuffer device is never destroyed. When the USB adapter is plugged again, it creates a new device /dev/fb1 and the console is not attached to it. This patch fixes the bug by unbinding the console from unlink_framebuffer. 
The code to unbind the console is moved from do_unregister_framebuffer to a function unbind_console. When the console is unbound, the reference count drops to zero and the udlfb driver frees the framebuffer. When the adapter is plugged back, a new framebuffer is created and the console is attached to it. Signed-off-by: Mikulas Patocka Cc: Dave Airlie Cc: Bernie Thompson Cc: Ladislav Michl Cc: stable@vger.kernel.org [b.zolnierkie: preserve old behavior for do_unregister_framebuffer()] Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/core/fbmem.c | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 0705d8883ede..8a29ec5992fd 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1687,12 +1687,12 @@ static int do_register_framebuffer(struct fb_info *fb_info) return 0; } -static int do_unregister_framebuffer(struct fb_info *fb_info) +static int unbind_console(struct fb_info *fb_info) { struct fb_event event; - int i, ret = 0; + int ret; + int i = fb_info->node; - i = fb_info->node; if (i < 0 || i >= FB_MAX || registered_fb[i] != fb_info) return -EINVAL; @@ -1707,17 +1707,29 @@ static int do_unregister_framebuffer(struct fb_info *fb_info) unlock_fb_info(fb_info); console_unlock(); + return ret; +} + +static int __unlink_framebuffer(struct fb_info *fb_info); + +static int do_unregister_framebuffer(struct fb_info *fb_info) +{ + struct fb_event event; + int ret; + + ret = unbind_console(fb_info); + if (ret) return -EINVAL; pm_vt_switch_unregister(fb_info->dev); - unlink_framebuffer(fb_info); + __unlink_framebuffer(fb_info); if (fb_info->pixmap.addr && (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) kfree(fb_info->pixmap.addr); fb_destroy_modelist(&fb_info->modelist); - registered_fb[i] = NULL; + registered_fb[fb_info->node] = NULL; num_registered_fb--; 
fb_cleanup_device(fb_info); event.info = fb_info; @@ -1730,7 +1742,7 @@ static int do_unregister_framebuffer(struct fb_info *fb_info) return 0; } -int unlink_framebuffer(struct fb_info *fb_info) +static int __unlink_framebuffer(struct fb_info *fb_info) { int i; @@ -1742,6 +1754,20 @@ int unlink_framebuffer(struct fb_info *fb_info) device_destroy(fb_class, MKDEV(FB_MAJOR, i)); fb_info->dev = NULL; } + + return 0; +} + +int unlink_framebuffer(struct fb_info *fb_info) +{ + int ret; + + ret = __unlink_framebuffer(fb_info); + if (ret) + return ret; + + unbind_console(fb_info); + return 0; } EXPORT_SYMBOL(unlink_framebuffer); From 3130702ac3a6c71a1196dfa0560d6ec9e54bf7aa Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 25 Jul 2018 15:41:55 +0200 Subject: [PATCH 0819/1212] udlfb: set optimal write delay commit bb24153a3f13dd0dbc1f8055ad97fe346d598f66 upstream. The default delay 5 jiffies is too much when the kernel is compiled with HZ=100 - it results in jumpy cursor in Xwindow. In order to find out the optimal delay, I benchmarked the driver on 1280x720x30fps video. I found out that with HZ=1000, 10ms is acceptable, but with HZ=250 or HZ=300, we need 4ms, so that the video is played without any frame skips. This patch changes the delay to this value. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Greg Kroah-Hartman --- include/video/udlfb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/video/udlfb.h b/include/video/udlfb.h index f9466fa54ba4..2ad9a6d37ff4 100644 --- a/include/video/udlfb.h +++ b/include/video/udlfb.h @@ -87,7 +87,7 @@ struct dlfb_data { #define MIN_RAW_PIX_BYTES 2 #define MIN_RAW_CMD_BYTES (RAW_HEADER_BYTES + MIN_RAW_PIX_BYTES) -#define DL_DEFIO_WRITE_DELAY 5 /* fb_deferred_io.delay in jiffies */ +#define DL_DEFIO_WRITE_DELAY msecs_to_jiffies(HZ <= 300 ? 
4 : 10) /* optimal value for 720p video */ #define DL_DEFIO_WRITE_DISABLE (HZ*60) /* "disable" with long delay */ /* remove these once align.h patch is taken into kernel */ From f7f501c753f36021ffea48bf8b5b50992cb2bdac Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 7 Jun 2018 13:43:48 +0200 Subject: [PATCH 0820/1212] getxattr: use correct xattr length commit 82c9a927bc5df6e06b72d206d24a9d10cced4eb5 upstream. When running in a container with a user namespace, if you call getxattr with name = "system.posix_acl_access" and size % 8 != 4, then getxattr silently skips the user namespace fixup that it normally does resulting in un-fixed-up data being returned. This is caused by posix_acl_fix_xattr_to_user() being passed the total buffer size and not the actual size of the xattr as returned by vfs_getxattr(). This commit passes the actual length of the xattr as returned by vfs_getxattr() down. A reproducer for the issue is: touch acl_posix setfacl -m user:0:rwx acl_posix and the compile: #define _GNU_SOURCE #include #include #include #include #include #include #include /* Run in user namespace with nsuid 0 mapped to uid != 0 on the host. 
*/ int main(int argc, void **argv) { ssize_t ret1, ret2; char buf1[128], buf2[132]; int fret = EXIT_SUCCESS; char *file; if (argc < 2) { fprintf(stderr, "Please specify a file with " "\"system.posix_acl_access\" permissions set\n"); _exit(EXIT_FAILURE); } file = argv[1]; ret1 = getxattr(file, "system.posix_acl_access", buf1, sizeof(buf1)); if (ret1 < 0) { fprintf(stderr, "%s - Failed to retrieve " "\"system.posix_acl_access\" " "from \"%s\"\n", strerror(errno), file); _exit(EXIT_FAILURE); } ret2 = getxattr(file, "system.posix_acl_access", buf2, sizeof(buf2)); if (ret2 < 0) { fprintf(stderr, "%s - Failed to retrieve " "\"system.posix_acl_access\" " "from \"%s\"\n", strerror(errno), file); _exit(EXIT_FAILURE); } if (ret1 != ret2) { fprintf(stderr, "The value of \"system.posix_acl_" "access\" for file \"%s\" changed " "between two successive calls\n", file); _exit(EXIT_FAILURE); } for (ssize_t i = 0; i < ret2; i++) { if (buf1[i] == buf2[i]) continue; fprintf(stderr, "Unexpected different in byte %zd: " "%02x != %02x\n", i, buf1[i], buf2[i]); fret = EXIT_FAILURE; } if (fret == EXIT_SUCCESS) fprintf(stderr, "Test passed\n"); else fprintf(stderr, "Test failed\n"); _exit(fret); } and run: ./tester acl_posix On a non-fixed up kernel this should return something like: root@c1:/# ./t Unexpected different in byte 16: ffffffa0 != 00 Unexpected different in byte 17: ffffff86 != 00 Unexpected different in byte 18: 01 != 00 and on a fixed kernel: root@c1:~# ./t Test passed Cc: stable@vger.kernel.org Fixes: 2f6f0654ab61 ("userns: Convert vfs posix_acl support to use kuids and kgids") Link: https://bugzilla.kernel.org/show_bug.cgi?id=199945 Reported-by: Colin Watson Signed-off-by: Christian Brauner Acked-by: Serge Hallyn Signed-off-by: Eric W. 
Biederman Signed-off-by: Greg Kroah-Hartman --- fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xattr.c b/fs/xattr.c index 76f01bf4b048..09441c396798 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -453,7 +453,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(kvalue, size); + posix_acl_fix_xattr_to_user(kvalue, error); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { From 95861df6188b3960e1d80292c905e3d040fd619c Mon Sep 17 00:00:00 2001 From: Shan Hai Date: Thu, 23 Aug 2018 02:02:56 +0800 Subject: [PATCH 0821/1212] bcache: release dc->writeback_lock properly in bch_writeback_thread() commit 3943b040f11ed0cc6d4585fd286a623ca8634547 upstream. The writeback thread would exit with a lock held when the cache device is detached via sysfs interface, fix it by releasing the held lock before exiting the while-loop. Fixes: fadd94e05c02 (bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set) Signed-off-by: Shan Hai Signed-off-by: Coly Li Tested-by: Shenghui Wang Cc: stable@vger.kernel.org #4.17+ Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/writeback.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index f2c0000de613..95a6ae053714 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -462,8 +462,10 @@ static int bch_writeback_thread(void *arg) * data on cache. BCACHE_DEV_DETACHING flag is set in * bch_cached_dev_detach(). 
*/ - if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + up_write(&dc->writeback_lock); break; + } } up_write(&dc->writeback_lock); From ef699421145eb85874c0ad6ca82575062b5ead34 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 14 Aug 2018 11:46:08 +0300 Subject: [PATCH 0822/1212] perf auxtrace: Fix queue resize commit 99cbbe56eb8bede625f410ab62ba34673ffa7d21 upstream. When the number of queues grows beyond 32, the array of queues is resized but not all members were being copied. Fix by also copying 'tid', 'cpu' and 'set'. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org Fixes: e502789302a6e ("perf auxtrace: Add helpers for queuing AUX area tracing data") Link: http://lkml.kernel.org/r/20180814084608.6563-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/perf/util/auxtrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 7f10430af39c..58426e7d320d 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -186,6 +186,9 @@ static int auxtrace_queues__grow(struct auxtrace_queues *queues, for (i = 0; i < queues->nr_queues; i++) { list_splice_tail(&queues->queue_array[i].head, &queue_array[i].head); + queue_array[i].tid = queues->queue_array[i].tid; + queue_array[i].cpu = queues->queue_array[i].cpu; + queue_array[i].set = queues->queue_array[i].set; queue_array[i].priv = queues->queue_array[i].priv; } From 59825a7ef329a4fb7ab24869a058af540f1840d5 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Tue, 31 Jul 2018 01:37:31 +0000 Subject: [PATCH 0823/1212] fs/quota: Fix spectre gadget in do_quotactl commit 7b6924d94a60c6b8c1279ca003e8744e6cd9e8b1 upstream. 'type' is user-controlled, so sanitize it after the bounds check to avoid using it in speculative execution. 
This covers the following potential gadgets detected with the help of smatch: * fs/ext4/super.c:5741 ext4_quota_read() warn: potential spectre issue 'sb_dqopt(sb)->files' [r] * fs/ext4/super.c:5778 ext4_quota_write() warn: potential spectre issue 'sb_dqopt(sb)->files' [r] * fs/f2fs/super.c:1552 f2fs_quota_read() warn: potential spectre issue 'sb_dqopt(sb)->files' [r] * fs/f2fs/super.c:1608 f2fs_quota_write() warn: potential spectre issue 'sb_dqopt(sb)->files' [r] * fs/quota/dquot.c:412 mark_info_dirty() warn: potential spectre issue 'sb_dqopt(sb)->info' [w] * fs/quota/dquot.c:933 dqinit_needed() warn: potential spectre issue 'dquots' [r] * fs/quota/dquot.c:2112 dquot_commit_info() warn: potential spectre issue 'dqopt->ops' [r] * fs/quota/dquot.c:2362 vfs_load_quota_inode() warn: potential spectre issue 'dqopt->files' [w] (local cap) * fs/quota/dquot.c:2369 vfs_load_quota_inode() warn: potential spectre issue 'dqopt->ops' [w] (local cap) * fs/quota/dquot.c:2370 vfs_load_quota_inode() warn: potential spectre issue 'dqopt->info' [w] (local cap) * fs/quota/quota.c:110 quota_getfmt() warn: potential spectre issue 'sb_dqopt(sb)->info' [r] * fs/quota/quota_v2.c:84 v2_check_quota_file() warn: potential spectre issue 'quota_magics' [w] * fs/quota/quota_v2.c:85 v2_check_quota_file() warn: potential spectre issue 'quota_versions' [w] * fs/quota/quota_v2.c:96 v2_read_file_info() warn: potential spectre issue 'dqopt->info' [r] * fs/quota/quota_v2.c:172 v2_write_file_info() warn: potential spectre issue 'dqopt->info' [r] Additionally, a quick inspection indicates there are array accesses with 'type' in quota_on() and quota_off() functions which are also addressed by this. 
Cc: Josh Poimboeuf Cc: stable@vger.kernel.org Signed-off-by: Jeremy Cline Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/quota/quota.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 3746367098fd..bb0d643481c8 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -17,6 +17,7 @@ #include #include #include +#include static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -644,6 +645,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS)) return -EINVAL; + type = array_index_nospec(type, MAXQUOTAS); /* * Quota not supported on this fs? Check this before s_quota_types * since they needn't be set if quota is not supported at all. From 1fc5fa527625d2cbddf9004b26e020ecc83d272d Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 24 Oct 2016 15:27:59 +1000 Subject: [PATCH 0824/1212] x86/io: add interface to reserve io memtype for a resource range. (v1.1) commit 8ef4227615e158faa4ee85a1d6466782f7e22f2f upstream. A recent change to the mm code in: 87744ab3832b mm: fix cache mode tracking in vm_insert_mixed() started enforcing checking the memory type against the registered list for mixed pfn insertion mappings. It happens that the drm drivers for a number of gpus relied on this being broken. Currently the driver only inserted VRAM mappings into the tracking table when they came from the kernel, and userspace mappings never landed in the table. This led to a regression where all the mappings end up as UC instead of WC now. I've considered a number of solutions but since this needs to be fixed in fixes and not next, and some of the solutions were going to introduce overhead that hadn't been there before I didn't consider them viable at this stage. These mainly concerned hooking into the TTM io reserve APIs, but these APIs have a bunch of fast paths I didn't want to unwind to add this to.
The solution I've decided on is to add a new API like the arch_phys_wc APIs (these would have worked but wc_del didn't take a range), and use them from the drivers to add a WC compatible mapping to the table for all VRAM on those GPUs. This means we can then create userspace mapping that won't get degraded to UC. v1.1: use CONFIG_X86_PAT + add some comments in io.h Cc: Toshi Kani Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: x86@kernel.org Cc: mcgrof@suse.com Cc: Dan Williams Acked-by: Ingo Molnar Reviewed-by: Thomas Gleixner Signed-off-by: Dave Airlie Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/io.h | 6 ++++++ arch/x86/mm/pat.c | 14 ++++++++++++++ include/linux/io.h | 22 ++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 9016b4b70375..6c5020163db0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -351,4 +351,10 @@ extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif +#ifdef CONFIG_X86_PAT +extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); +extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); +#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc +#endif + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3146b1da6d72..5ff0cb74de55 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -726,6 +726,20 @@ void io_free_memtype(resource_size_t start, resource_size_t end) free_memtype(start, end); } +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ + enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + + return io_reserve_memtype(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ + io_free_memtype(start, start + 
size); +} +EXPORT_SYMBOL(arch_io_free_memtype_wc); + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { diff --git a/include/linux/io.h b/include/linux/io.h index de64c1e53612..8ab45611fc35 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -154,4 +154,26 @@ enum { void *memremap(resource_size_t offset, size_t size, unsigned long flags); void memunmap(void *addr); +/* + * On x86 PAT systems we have memory tracking that keeps track of + * the allowed mappings on memory ranges. This tracking works for + * all the in-kernel mapping APIs (ioremap*), but where the user + * wishes to map a range from a physical device into user memory + * the tracking won't be updated. This API is to be used by + * drivers which remap physical device pages into userspace, + * and wants to make sure they are mapped WC and not UC. + */ +#ifndef arch_io_reserve_memtype_wc +static inline int arch_io_reserve_memtype_wc(resource_size_t base, + resource_size_t size) +{ + return 0; +} + +static inline void arch_io_free_memtype_wc(resource_size_t base, + resource_size_t size) +{ +} +#endif + #endif /* _LINUX_IO_H */ From c59fdc4cfbda52ce081c59540762185d765c3369 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 24 Oct 2016 15:37:48 +1000 Subject: [PATCH 0825/1212] drm/drivers: add support for using the arch wc mapping API. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7cf321d118a825c1541b43ca45294126fd474efa upstream. This fixes a regression in all these drivers since the cache mode tracking was fixed for mixed mappings. It uses the new arch API to add the VRAM range to the PAT mapping tracking tables. Fixes: 87744ab3832 (mm: fix cache mode tracking in vm_insert_mixed()) Reviewed-by: Christian König . 
Signed-off-by: Dave Airlie Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 +++++ drivers/gpu/drm/ast/ast_ttm.c | 6 ++++++ drivers/gpu/drm/cirrus/cirrus_ttm.c | 7 +++++++ drivers/gpu/drm/mgag200/mgag200_ttm.c | 7 +++++++ drivers/gpu/drm/nouveau/nouveau_ttm.c | 8 ++++++++ drivers/gpu/drm/radeon/radeon_object.c | 5 +++++ 6 files changed, 38 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 73628c7599e7..3aca9a9011fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -492,6 +492,10 @@ void amdgpu_bo_force_delete(struct amdgpu_device *adev) int amdgpu_bo_init(struct amdgpu_device *adev) { + /* reserve PAT memory space to WC for VRAM */ + arch_io_reserve_memtype_wc(adev->mc.aper_base, + adev->mc.aper_size); + /* Add an MTRR for the VRAM */ adev->mc.vram_mtrr = arch_phys_wc_add(adev->mc.aper_base, adev->mc.aper_size); @@ -507,6 +511,7 @@ void amdgpu_bo_fini(struct amdgpu_device *adev) { amdgpu_ttm_fini(adev); arch_phys_wc_del(adev->mc.vram_mtrr); + arch_io_free_memtype_wc(adev->mc.aper_base, adev->mc.aper_size); } int amdgpu_bo_fbdev_mmap(struct amdgpu_bo *bo, diff --git a/drivers/gpu/drm/ast/ast_ttm.c b/drivers/gpu/drm/ast/ast_ttm.c index 08f82eae6939..ac12f74e6b32 100644 --- a/drivers/gpu/drm/ast/ast_ttm.c +++ b/drivers/gpu/drm/ast/ast_ttm.c @@ -275,6 +275,8 @@ int ast_mm_init(struct ast_private *ast) return ret; } + arch_io_reserve_memtype_wc(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); ast->fb_mtrr = arch_phys_wc_add(pci_resource_start(dev->pdev, 0), pci_resource_len(dev->pdev, 0)); @@ -283,11 +285,15 @@ int ast_mm_init(struct ast_private *ast) void ast_mm_fini(struct ast_private *ast) { + struct drm_device *dev = ast->dev; + ttm_bo_device_release(&ast->ttm.bdev); ast_ttm_global_release(ast); arch_phys_wc_del(ast->fb_mtrr); + 
arch_io_free_memtype_wc(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); } void ast_ttm_placement(struct ast_bo *bo, int domain) diff --git a/drivers/gpu/drm/cirrus/cirrus_ttm.c b/drivers/gpu/drm/cirrus/cirrus_ttm.c index dfffd528517a..393967025043 100644 --- a/drivers/gpu/drm/cirrus/cirrus_ttm.c +++ b/drivers/gpu/drm/cirrus/cirrus_ttm.c @@ -275,6 +275,9 @@ int cirrus_mm_init(struct cirrus_device *cirrus) return ret; } + arch_io_reserve_memtype_wc(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); + cirrus->fb_mtrr = arch_phys_wc_add(pci_resource_start(dev->pdev, 0), pci_resource_len(dev->pdev, 0)); @@ -284,6 +287,8 @@ int cirrus_mm_init(struct cirrus_device *cirrus) void cirrus_mm_fini(struct cirrus_device *cirrus) { + struct drm_device *dev = cirrus->dev; + if (!cirrus->mm_inited) return; @@ -293,6 +298,8 @@ void cirrus_mm_fini(struct cirrus_device *cirrus) arch_phys_wc_del(cirrus->fb_mtrr); cirrus->fb_mtrr = 0; + arch_io_free_memtype_wc(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); } void cirrus_ttm_placement(struct cirrus_bo *bo, int domain) diff --git a/drivers/gpu/drm/mgag200/mgag200_ttm.c b/drivers/gpu/drm/mgag200/mgag200_ttm.c index 05108b505fbf..d9df8d32fc35 100644 --- a/drivers/gpu/drm/mgag200/mgag200_ttm.c +++ b/drivers/gpu/drm/mgag200/mgag200_ttm.c @@ -274,6 +274,9 @@ int mgag200_mm_init(struct mga_device *mdev) return ret; } + arch_io_reserve_memtype_wc(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); + mdev->fb_mtrr = arch_phys_wc_add(pci_resource_start(dev->pdev, 0), pci_resource_len(dev->pdev, 0)); @@ -282,10 +285,14 @@ int mgag200_mm_init(struct mga_device *mdev) void mgag200_mm_fini(struct mga_device *mdev) { + struct drm_device *dev = mdev->dev; + ttm_bo_device_release(&mdev->ttm.bdev); mgag200_ttm_global_release(mdev); + arch_io_free_memtype_wc(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); arch_phys_wc_del(mdev->fb_mtrr); mdev->fb_mtrr = 0; } 
diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c index d2e7d209f651..9835327a3214 100644 --- a/drivers/gpu/drm/nouveau/nouveau_ttm.c +++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c @@ -397,6 +397,9 @@ nouveau_ttm_init(struct nouveau_drm *drm) /* VRAM init */ drm->gem.vram_available = drm->device.info.ram_user; + arch_io_reserve_memtype_wc(device->func->resource_addr(device, 1), + device->func->resource_size(device, 1)); + ret = ttm_bo_init_mm(&drm->ttm.bdev, TTM_PL_VRAM, drm->gem.vram_available >> PAGE_SHIFT); if (ret) { @@ -429,6 +432,8 @@ nouveau_ttm_init(struct nouveau_drm *drm) void nouveau_ttm_fini(struct nouveau_drm *drm) { + struct nvkm_device *device = nvxx_device(&drm->device); + ttm_bo_clean_mm(&drm->ttm.bdev, TTM_PL_VRAM); ttm_bo_clean_mm(&drm->ttm.bdev, TTM_PL_TT); @@ -438,4 +443,7 @@ nouveau_ttm_fini(struct nouveau_drm *drm) arch_phys_wc_del(drm->ttm.mtrr); drm->ttm.mtrr = 0; + arch_io_free_memtype_wc(device->func->resource_addr(device, 1), + device->func->resource_size(device, 1)); + } diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 83aee9e814ba..18ec38d0d3f5 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -447,6 +447,10 @@ void radeon_bo_force_delete(struct radeon_device *rdev) int radeon_bo_init(struct radeon_device *rdev) { + /* reserve PAT memory space to WC for VRAM */ + arch_io_reserve_memtype_wc(rdev->mc.aper_base, + rdev->mc.aper_size); + /* Add an MTRR for the VRAM */ if (!rdev->fastfb_working) { rdev->mc.vram_mtrr = arch_phys_wc_add(rdev->mc.aper_base, @@ -464,6 +468,7 @@ void radeon_bo_fini(struct radeon_device *rdev) { radeon_ttm_fini(rdev); arch_phys_wc_del(rdev->mc.vram_mtrr); + arch_io_free_memtype_wc(rdev->mc.aper_base, rdev->mc.aper_size); } /* Returns how many bytes TTM can move per IB. 
From fdf53713aebb1e8ccbfcadade2b8449e62394547 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 9 Sep 2018 20:04:37 +0200 Subject: [PATCH 0826/1212] Linux 4.4.155 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b184286cf7e6..2d9f89ec8397 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 154 +SUBLEVEL = 155 EXTRAVERSION = NAME = Blurry Fish Butt From e3dea38fc8528c9d04acd9a28bcdd7dab3b461fa Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 27 Jun 2018 17:46:50 +0200 Subject: [PATCH 0827/1212] x86/speculation/l1tf: Fix up pte->pfn conversion for PAE commit e14d7dfb41f5807a0c1c26a13f2b8ef16af24935 upstream. Jan has noticed that pte_pfn and co. resp. pfn_pte are incorrect for CONFIG_PAE because phys_addr_t is wider than unsigned long and so the pte_val resp. shift left would get truncated. Fix this up by using proper types. [Just one chunk, again, needed here. Thanks to Ben and Guenter for finding and fixing this. 
- gregkh] Fixes: 6b28baca9b1f ("x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation") Reported-by: Jan Beulich Signed-off-by: Michal Hocko Signed-off-by: Thomas Gleixner Acked-by: Vlastimil Babka Cc: Guenter Roeck Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 68a55273ce0f..a67d7f210b7c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -385,7 +385,7 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { - phys_addr_t pfn = page_nr << PAGE_SHIFT; + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PHYSICAL_PUD_PAGE_MASK; return __pud(pfn | massage_pgprot(pgprot)); From 2c155709e4ef2d86d0176aac82e44c048a7e0255 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 4 Sep 2018 09:33:36 -0700 Subject: [PATCH 0828/1212] staging: android: ion: fix ION_IOC_{MAP,SHARE} use-after-free The ION_IOC_{MAP,SHARE} ioctls drop and reacquire client->lock several times while operating on one of the client's ion_handles. This creates windows where userspace can call ION_IOC_FREE on the same client with the same handle, and effectively make the kernel drop its own reference. For example: - thread A: ION_IOC_ALLOC creates an ion_handle with refcount 1 - thread A: starts ION_IOC_MAP and increments the refcount to 2 - thread B: ION_IOC_FREE decrements the refcount to 1 - thread B: ION_IOC_FREE decrements the refcount to 0 and frees the handle - thread A: continues ION_IOC_MAP with a dangling ion_handle * to freed memory Fix this by holding client->lock for the duration of ION_IOC_{MAP,SHARE}, preventing the concurrent ION_IOC_FREE. Also remove ion_handle_get_by_id(), since there's literally no way to use it safely. 
This patch is applied on top of 4.4.y, and applies to older kernels too. 4.9.y was fixed separately. Kernels 4.12 and later are unaffected, since all the underlying ion_handle infrastructure has been ripped out. Cc: stable@vger.kernel.org # v4.4- Signed-off-by: Greg Hackmann Acked-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/ion/ion.c | 60 +++++++++++++++++++------------ 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 47cb163da9a0..4adb1138af09 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -449,18 +449,6 @@ static struct ion_handle *ion_handle_get_by_id_nolock(struct ion_client *client, return ERR_PTR(-EINVAL); } -struct ion_handle *ion_handle_get_by_id(struct ion_client *client, - int id) -{ - struct ion_handle *handle; - - mutex_lock(&client->lock); - handle = ion_handle_get_by_id_nolock(client, id); - mutex_unlock(&client->lock); - - return handle; -} - static bool ion_handle_validate(struct ion_client *client, struct ion_handle *handle) { @@ -1138,24 +1126,28 @@ static struct dma_buf_ops dma_buf_ops = { .kunmap = ion_dma_buf_kunmap, }; -struct dma_buf *ion_share_dma_buf(struct ion_client *client, - struct ion_handle *handle) +static struct dma_buf *__ion_share_dma_buf(struct ion_client *client, + struct ion_handle *handle, + bool lock_client) { DEFINE_DMA_BUF_EXPORT_INFO(exp_info); struct ion_buffer *buffer; struct dma_buf *dmabuf; bool valid_handle; - mutex_lock(&client->lock); + if (lock_client) + mutex_lock(&client->lock); valid_handle = ion_handle_validate(client, handle); if (!valid_handle) { WARN(1, "%s: invalid handle passed to share.\n", __func__); - mutex_unlock(&client->lock); + if (lock_client) + mutex_unlock(&client->lock); return ERR_PTR(-EINVAL); } buffer = handle->buffer; ion_buffer_get(buffer); - mutex_unlock(&client->lock); + if (lock_client) + mutex_unlock(&client->lock); 
exp_info.ops = &dma_buf_ops; exp_info.size = buffer->size; @@ -1170,14 +1162,21 @@ struct dma_buf *ion_share_dma_buf(struct ion_client *client, return dmabuf; } + +struct dma_buf *ion_share_dma_buf(struct ion_client *client, + struct ion_handle *handle) +{ + return __ion_share_dma_buf(client, handle, true); +} EXPORT_SYMBOL(ion_share_dma_buf); -int ion_share_dma_buf_fd(struct ion_client *client, struct ion_handle *handle) +static int __ion_share_dma_buf_fd(struct ion_client *client, + struct ion_handle *handle, bool lock_client) { struct dma_buf *dmabuf; int fd; - dmabuf = ion_share_dma_buf(client, handle); + dmabuf = __ion_share_dma_buf(client, handle, lock_client); if (IS_ERR(dmabuf)) return PTR_ERR(dmabuf); @@ -1187,8 +1186,19 @@ int ion_share_dma_buf_fd(struct ion_client *client, struct ion_handle *handle) return fd; } + +int ion_share_dma_buf_fd(struct ion_client *client, struct ion_handle *handle) +{ + return __ion_share_dma_buf_fd(client, handle, true); +} EXPORT_SYMBOL(ion_share_dma_buf_fd); +static int ion_share_dma_buf_fd_nolock(struct ion_client *client, + struct ion_handle *handle) +{ + return __ion_share_dma_buf_fd(client, handle, false); +} + struct ion_handle *ion_import_dma_buf(struct ion_client *client, int fd) { struct dma_buf *dmabuf; @@ -1335,11 +1345,15 @@ static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct ion_handle *handle; - handle = ion_handle_get_by_id(client, data.handle.handle); - if (IS_ERR(handle)) + mutex_lock(&client->lock); + handle = ion_handle_get_by_id_nolock(client, data.handle.handle); + if (IS_ERR(handle)) { + mutex_unlock(&client->lock); return PTR_ERR(handle); - data.fd.fd = ion_share_dma_buf_fd(client, handle); - ion_handle_put(handle); + } + data.fd.fd = ion_share_dma_buf_fd_nolock(client, handle); + ion_handle_put_nolock(handle); + mutex_unlock(&client->lock); if (data.fd.fd < 0) ret = data.fd.fd; break; From e1e4b0be0dec7bd5fb1f125c942770903f7d6eb1 Mon Sep 17 00:00:00 2001 From: Doug 
Berger Date: Tue, 28 Aug 2018 12:33:15 -0700 Subject: [PATCH 0829/1212] net: bcmgenet: use MAC link status for fixed phy [ Upstream commit c3c397c1f16c51601a3fac4fe0c63ad8aa85a904 ] When using the fixed PHY with GENET (e.g. MOCA) the PHY link status can be determined from the internal link status captured by the MAC. This allows the PHY state machine to use the correct link state with the fixed PHY even if MAC link event interrupts are missed when the net device is opened. Fixes: 8d88c6ebb34c ("net: bcmgenet: enable MoCA link state change detection") Signed-off-by: Doug Berger Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/genet/bcmgenet.h | 3 +++ drivers/net/ethernet/broadcom/genet/bcmmii.c | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index cef53f2d9854..ce20bc939b38 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -185,6 +185,9 @@ struct bcmgenet_mib_counters { #define UMAC_MAC1 0x010 #define UMAC_MAX_FRAME_LEN 0x014 +#define UMAC_MODE 0x44 +#define MODE_LINK_STATUS (1 << 5) + #define UMAC_EEE_CTRL 0x064 #define EN_LPI_RX_PAUSE (1 << 0) #define EN_LPI_TX_PFC (1 << 1) diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index e96d1f95bb47..4c73feca4842 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -167,8 +167,14 @@ void bcmgenet_mii_setup(struct net_device *dev) static int bcmgenet_fixed_phy_link_update(struct net_device *dev, struct fixed_phy_status *status) { - if (dev && dev->phydev && status) - status->link = dev->phydev->link; + struct bcmgenet_priv *priv; + u32 reg; + + if (dev && dev->phydev && status) { + priv = netdev_priv(dev); + reg = bcmgenet_umac_readl(priv, 
UMAC_MODE); + status->link = !!(reg & MODE_LINK_STATUS); + } return 0; } From 375e88743c94e9b9871b9e50c66e11b40667530f Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 23 Aug 2018 13:20:52 -0700 Subject: [PATCH 0830/1212] qlge: Fix netdev features configuration. [ Upstream commit 6750c87074c5b534d82fdaabb1deb45b8f1f57de ] qlge_fix_features() is not supposed to modify hardware or driver state, rather it is supposed to only fix requested feature bits. Currently qlge_fix_features() also goes for interface down and up unnecessarily if there is not even any change in features set. This patch changes/fixes following - 1) Move reload of interface or device re-config from qlge_fix_features() to qlge_set_features(). 2) Reload of interface in qlge_set_features() only if relevant feature bit (NETIF_F_HW_VLAN_CTAG_RX) is changed. 3) Get rid of qlge_fix_features() since driver is not really required to fix any features bit. Signed-off-by: Manish Reviewed-by: Benjamin Poirier Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 23 +++++++------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index b28e73ea2c25..f39ad0e66637 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -2388,26 +2388,20 @@ static int qlge_update_hw_vlan_features(struct net_device *ndev, return status; } -static netdev_features_t qlge_fix_features(struct net_device *ndev, - netdev_features_t features) -{ - int err; - - /* Update the behavior of vlan accel in the adapter */ - err = qlge_update_hw_vlan_features(ndev, features); - if (err) - return err; - - return features; -} - static int qlge_set_features(struct net_device *ndev, netdev_features_t features) { netdev_features_t changed = ndev->features ^ features; + int err; + + if (changed & NETIF_F_HW_VLAN_CTAG_RX) { + /* Update the behavior of vlan accel in the adapter */ + err = qlge_update_hw_vlan_features(ndev, features); + if (err) + return err; - if (changed & NETIF_F_HW_VLAN_CTAG_RX) qlge_vlan_mode(ndev, features); + } return 0; } @@ -4720,7 +4714,6 @@ static const struct net_device_ops qlge_netdev_ops = { .ndo_set_mac_address = qlge_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_tx_timeout = qlge_tx_timeout, - .ndo_fix_features = qlge_fix_features, .ndo_set_features = qlge_set_features, .ndo_vlan_rx_add_vid = qlge_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = qlge_vlan_rx_kill_vid, From 86a0a00794c21b35c72d767a98fb917b5b76b513 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Aug 2018 14:24:29 +0200 Subject: [PATCH 0831/1212] tcp: do not restart timewait timer on rst reception [ Upstream commit 63cc357f7bba6729869565a12df08441a5995d9a ] RFC 1337 says: ''Ignore RST segments in TIME-WAIT state. 
If the 2 minute MSL is enforced, this fix avoids all three hazards.'' So with net.ipv4.tcp_rfc1337=1, expected behaviour is to have TIME-WAIT sk expire rather than removing it instantly when a reset is received. However, Linux will also re-start the TIME-WAIT timer. This causes connect to fail when tying to re-use ports or very long delays (until syn retry interval exceeds MSL). packetdrill test case: // Demonstrate bogus rearming of TIME-WAIT timer in rfc1337 mode. `sysctl net.ipv4.tcp_rfc1337=1` 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 0.000 bind(3, ..., ...) = 0 0.000 listen(3, 1) = 0 0.100 < S 0:0(0) win 29200 0.100 > S. 0:0(0) ack 1 0.200 < . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 // Receive first segment 0.310 < P. 1:1001(1000) ack 1 win 46 // Send one ACK 0.310 > . 1:1(0) ack 1001 // read 1000 byte 0.310 read(4, ..., 1000) = 1000 // Application writes 100 bytes 0.350 write(4, ..., 100) = 100 0.350 > P. 1:101(100) ack 1001 // ACK 0.500 < . 1001:1001(0) ack 101 win 257 // close the connection 0.600 close(4) = 0 0.600 > F. 101:101(0) ack 1001 win 244 // Our side is in FIN_WAIT_1 & waits for ack to fin 0.7 < . 1001:1001(0) ack 102 win 244 // Our side is in FIN_WAIT_2 with no outstanding data. 0.8 < F. 1001:1001(0) ack 102 win 244 0.8 > . 102:102(0) ack 1002 win 244 // Our side is now in TIME_WAIT state, send ack for fin. 0.9 < F. 1002:1002(0) ack 102 win 244 0.9 > . 102:102(0) ack 1002 win 244 // Peer reopens with in-window SYN: 1.000 < S 1000:1000(0) win 9200 // Therefore, reply with ACK. 1.000 > . 102:102(0) ack 1002 win 244 // Peer sends RST for this ACK. Normally this RST results // in tw socket removal, but rfc1337=1 setting prevents this. 1.100 < R 1002:1002(0) win 244 // second syn. Due to rfc1337=1 expect another pure ACK. 31.0 < S 1000:1000(0) win 9200 31.0 > . 102:102(0) ack 1002 win 244 // .. and another RST from peer. 
31.1 < R 1002:1002(0) win 244 31.2 `echo no timer restart;ss -m -e -a -i -n -t -o state TIME-WAIT` // third syn after one minute. Time-Wait socket should have expired by now. 63.0 < S 1000:1000(0) win 9200 // so we expect a syn-ack & 3whs to proceed from here on. 63.0 > S. 0:0(0) ack 1 Without this patch, 'ss' shows restarts of tw timer and last packet is thus just another pure ack, more than one minute later. This restores the original code from commit 283fd6cf0be690a83 ("Merge in ANK networking jumbo patch") in netdev-vger-cvs.git . For some reason the else branch was removed/lost in 1f28b683339f7 ("Merge in TCP/UDP optimizations and [..]") and timer restart became unconditional. Reported-by: Michal Tesar Signed-off-by: Florian Westphal Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_minisocks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4c1c94fa8f08..d270870bf492 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -200,8 +200,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } + } else { + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; From 4890349d7902027cd7380c7a0e23f429907473e9 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 23 Aug 2018 19:49:54 +0300 Subject: [PATCH 0832/1212] vti6: remove !skb->ignore_df check from vti6_xmit() [ Upstream commit 9f2895461439fda2801a7906fb4c5fb3dbb37a0a ] Before the commit d6990976af7c ("vti6: fix PMTU caching and reporting on xmit") '!skb->ignore_df' check was always true because the function skb_scrub_packet() was called before it, resetting ignore_df to zero. 
In the commit, skb_scrub_packet() was moved below, and now this check can be false for the packet, e.g. when sending it in the two fragments, this prevents successful PMTU updates in such case. The next attempts to send the packet lead to the same tx error. Moreover, vti6 initial MTU value relies on PMTU adjustments. This issue can be reproduced with the following LTP test script: udp_ipsec_vti.sh -6 -p ah -m tunnel -s 2000 Fixes: ccd740cbc6e0 ("vti6: Add pmtu handling to vti6_xmit.") Signed-off-by: Alexey Kodanev Acked-by: Steffen Klassert Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_vti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 6aca9a6b2303..60d4052d97a6 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -470,7 +470,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) } mtu = dst_mtu(dst); - if (!skb->ignore_df && skb->len > mtu) { + if (skb->len > mtu) { skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { From d98ec8a9e205f885f66b5a0890daa3ce3d2cac74 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 22 Aug 2018 12:19:24 +1000 Subject: [PATCH 0833/1212] cifs: check if SMB2 PDU size has been padded and suppress the warning [ Upstream commit e6c47dd0da1e3a484e778046fc10da0b20606a86 ] Some SMB2/3 servers, Win2016 but possibly others too, adds padding not only between PDUs in a compound but also to the final PDU. This padding extends the PDU to a multiple of 8 bytes. 
Check if the unexpected length looks like this might be the case and avoid triggering the log messages for : "SMB2 server sent bad RFC1001 len %d not %d\n" Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2misc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 76ccf20fbfb7..0e62bf1ebbd7 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -184,6 +184,13 @@ smb2_check_message(char *buf, unsigned int length) if (clc_len == 4 + len + 1) return 0; + /* + * Some windows servers (win2016) will pad also the final + * PDU in a compound to 8 bytes. + */ + if (((clc_len + 7) & ~7) == len) + return 0; + /* * MacOS server pads after SMB2.1 write response with 3 bytes * of junk. Other servers match RFC1001 len to actual From 14957e348e78a760f70b7c31ed8573fd8b1c5e9e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 21 Aug 2018 21:59:12 -0700 Subject: [PATCH 0834/1212] hfsplus: don't return 0 when fill_super() failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7464726cb5998846306ed0a7d6714afb2e37b25d ] syzbot is reporting NULL pointer dereference at mount_fs() [1]. This is because hfsplus_fill_super() is by error returning 0 when hfsplus_fill_super() detected invalid filesystem image, and mount_bdev() is returning NULL because dget(s->s_root) == NULL if s->s_root == NULL, and mount_fs() is accessing root->d_sb because IS_ERR(root) == false if root == NULL. Fix this by returning -EINVAL when hfsplus_fill_super() detected invalid filesystem image. [1] https://syzkaller.appspot.com/bug?id=21acb6850cecbc960c927229e597158cf35f33d0 Link: http://lkml.kernel.org/r/d83ce31a-874c-dd5b-f790-41405983a5be@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reported-by: syzbot Reviewed-by: Ernesto A. 
Fernández Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/hfsplus/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index fa40e756c501..422e00dc5f3b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -521,8 +521,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) goto out_put_root; if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); - if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) + if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { + err = -EINVAL; goto out_put_root; + } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); if (IS_ERR(inode)) { err = PTR_ERR(inode); From 189ff5b00004072b63969ea28b8d1512a651892c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Thu, 23 Aug 2018 17:00:31 -0700 Subject: [PATCH 0835/1212] hfs: prevent crash on exit from failed search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit dc2572791d3a41bab94400af2b6bca9d71ccd303 ] hfs_find_exit() expects fd->bnode to be NULL after a search has failed. hfs_brec_insert() may instead set it to an error-valued pointer. Fix this to prevent a crash. Link: http://lkml.kernel.org/r/53d9749a029c41b4016c495fc5838c9dba3afc52.1530294815.git.ernesto.mnd.fernandez@gmail.com Signed-off-by: Ernesto A. 
Fernández Cc: Anatoly Trosinenko Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/hfs/brec.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 6fc766df0461..2a6f3c67cb3f 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -74,9 +74,10 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) if (!fd->bnode) { if (!tree->root) hfs_btree_inc_height(tree); - fd->bnode = hfs_bnode_find(tree, tree->leaf_head); - if (IS_ERR(fd->bnode)) - return PTR_ERR(fd->bnode); + node = hfs_bnode_find(tree, tree->leaf_head); + if (IS_ERR(node)) + return PTR_ERR(node); + fd->bnode = node; fd->record = -1; } new_node = NULL; From b7befd11e0b259699ed1ee69dd3ee66da25b2d5e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 21 Aug 2018 22:00:58 -0700 Subject: [PATCH 0836/1212] fork: don't copy inconsistent signal handler state to child [ Upstream commit 06e62a46bbba20aa5286102016a04214bb446141 ] Before this change, if a multithreaded process forks while one of its threads is changing a signal handler using sigaction(), the memcpy() in copy_sighand() can race with the struct assignment in do_sigaction(). It isn't clear whether this can cause corruption of the userspace signal handler pointer, but it definitely can cause inconsistency between different fields of struct sigaction. Take the appropriate spinlock to avoid this. I have tested that this patch prevents inconsistency between sa_sigaction and sa_flags, which is possible before this patch. 
Link: http://lkml.kernel.org/r/20180702145108.73189-1-jannh@google.com Signed-off-by: Jann Horn Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Rik van Riel Cc: "Peter Zijlstra (Intel)" Cc: Kees Cook Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/fork.c b/kernel/fork.c index ac00f14208b7..37ec96fe739d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1109,7 +1109,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; atomic_set(&sig->count, 1); + spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + spin_unlock_irq(&current->sighand->siglock); return 0; } From ccbe4990bb1bf84a425015f68d7e7a2b9d1b3f8a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:59:34 -0700 Subject: [PATCH 0837/1212] reiserfs: change j_timestamp type to time64_t [ Upstream commit 8b73ce6a4bae4fe12bcb2c361c0da4183c2e1b6f ] This uses the deprecated time_t type but is write-only, and could be removed, but as Jeff explains, having a timestamp can be useful for post-mortem analysis in crash dumps. In order to remove one of the last instances of time_t, this changes the type to time64_t, same as j_trans_start_time. 
Link: http://lkml.kernel.org/r/20180622133315.221210-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Jan Kara Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/reiserfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 6ca00471afbf..d920a646b578 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -270,7 +270,7 @@ struct reiserfs_journal_list { struct mutex j_commit_mutex; unsigned int j_trans_id; - time_t j_timestamp; + time64_t j_timestamp; /* write-only but useful for crash dump analysis */ struct reiserfs_list_bitmap *j_list_bitmap; struct buffer_head *j_commit_bh; /* commit buffer head */ struct reiserfs_journal_cnode *j_realblock; From fed5bd3352a3fbf9494449ffee3b4bab5e8cc3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Thu, 23 Aug 2018 17:00:25 -0700 Subject: [PATCH 0838/1212] hfsplus: fix NULL dereference in hfsplus_lookup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a7ec7a4193a2eb3b5341243fc0b621c1ac9e4ec4 ] An HFS+ filesystem can be mounted read-only without having a metadata directory, which is needed to support hardlinks. But if the catalog data is corrupted, a directory lookup may still find dentries claiming to be hardlinks. hfsplus_lookup() does check that ->hidden_dir is not NULL in such a situation, but mistakenly does so after dereferencing it for the first time. Reorder this check to prevent a crash. This happens when looking up corrupted catalog data (dentry) on a filesystem with no metadata directory (this could only ever happen on a read-only mount). Wen Xu sent the replication steps in detail to the fsdevel list: https://bugzilla.kernel.org/show_bug.cgi?id=200297 Link: http://lkml.kernel.org/r/20180712215344.q44dyrhymm4ajkao@eaf Signed-off-by: Ernesto A. 
Fernández Reported-by: Wen Xu Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/hfsplus/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d0f39dcbb58e..2b6e2ad57bf9 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -77,13 +77,13 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, cpu_to_be32(HFSP_HARDLINK_TYPE) && entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && + HFSPLUS_SB(sb)->hidden_dir && (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> create_date || entry.file.create_date == HFSPLUS_I(d_inode(sb->s_root))-> - create_date) && - HFSPLUS_SB(sb)->hidden_dir) { + create_date)) { struct qstr str; char name[32]; From 192710dab763194ca62d2169cf282a88888291a0 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 21 Aug 2018 21:59:44 -0700 Subject: [PATCH 0839/1212] fat: validate ->i_start before using [ Upstream commit 0afa9626667c3659ef8bd82d42a11e39fedf235c ] On corrupted FATfs may have invalid ->i_start. To handle it, this checks ->i_start before using, and return proper error code. 
Link: http://lkml.kernel.org/r/87o9f8y1t5.fsf_-_@mail.parknet.co.jp Signed-off-by: OGAWA Hirofumi Reported-by: Anatoly Trosinenko Tested-by: Anatoly Trosinenko Cc: Alan Cox Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/fat/cache.c | 19 ++++++++++++------- fs/fat/fat.h | 5 +++++ fs/fat/fatent.c | 6 +++--- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 93fc62232ec2..9ae2c4d7e921 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -224,7 +224,8 @@ static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus) int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) { struct super_block *sb = inode->i_sb; - const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const int limit = sb->s_maxbytes >> sbi->cluster_bits; struct fat_entry fatent; struct fat_cache_id cid; int nr; @@ -233,6 +234,12 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) *fclus = 0; *dclus = MSDOS_I(inode)->i_start; + if (!fat_valid_entry(sbi, *dclus)) { + fat_fs_error_ratelimit(sb, + "%s: invalid start cluster (i_pos %lld, start %08x)", + __func__, MSDOS_I(inode)->i_pos, *dclus); + return -EIO; + } if (cluster == 0) return 0; @@ -249,9 +256,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { fat_fs_error_ratelimit(sb, - "%s: detected the cluster chain loop" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + "%s: detected the cluster chain loop (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } @@ -261,9 +267,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) goto out; else if (nr == FAT_ENT_FREE) { fat_fs_error_ratelimit(sb, - "%s: invalid cluster chain (i_pos %lld)", - 
__func__, - MSDOS_I(inode)->i_pos); + "%s: invalid cluster chain (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { diff --git a/fs/fat/fat.h b/fs/fat/fat.h index be5e15323bab..1849b1adb6b9 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -344,6 +344,11 @@ static inline void fatent_brelse(struct fat_entry *fatent) fatent->fat_inode = NULL; } +static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry) +{ + return FAT_START_ENT <= entry && entry < sbi->max_cluster; +} + extern void fat_ent_access_init(struct super_block *sb); extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 8226557130a2..a70e37c47a78 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -23,7 +23,7 @@ static void fat12_ent_blocknr(struct super_block *sb, int entry, { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = entry + (entry >> 1); - WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } @@ -33,7 +33,7 @@ static void fat_ent_blocknr(struct super_block *sb, int entry, { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = (entry << sbi->fatent_shift); - WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } @@ -353,7 +353,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) int err, offset; sector_t blocknr; - if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { + if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; From af25dc4cf354bee251a422c4c7ed1bb6b7dd1649 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Aug 2018 
12:30:38 -0700 Subject: [PATCH 0840/1212] scripts: modpost: check memory allocation results [ Upstream commit 1f3aa9002dc6a0d59a4b599b4fc8f01cf43ef014 ] Fix missing error check for memory allocation functions in scripts/mod/modpost.c. Fixes kernel bugzilla #200319: https://bugzilla.kernel.org/show_bug.cgi?id=200319 Signed-off-by: Randy Dunlap Cc: Yuexing Wang Signed-off-by: Masahiro Yamada Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- scripts/mod/modpost.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index bd5151915e5a..064fbfbbb22c 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -649,7 +649,7 @@ static void handle_modversions(struct module *mod, struct elf_info *info, if (ELF_ST_TYPE(sym->st_info) == STT_SPARC_REGISTER) break; if (symname[0] == '.') { - char *munged = strdup(symname); + char *munged = NOFAIL(strdup(symname)); munged[0] = '_'; munged[1] = toupper(munged[1]); symname = munged; @@ -1311,7 +1311,7 @@ static Elf_Sym *find_elf_symbol2(struct elf_info *elf, Elf_Addr addr, static char *sec2annotation(const char *s) { if (match(s, init_exit_sections)) { - char *p = malloc(20); + char *p = NOFAIL(malloc(20)); char *r = p; *p++ = '_'; @@ -1331,7 +1331,7 @@ static char *sec2annotation(const char *s) strcat(p, " "); return r; } else { - return strdup(""); + return NOFAIL(strdup("")); } } @@ -2032,7 +2032,7 @@ void buf_write(struct buffer *buf, const char *s, int len) { if (buf->size - buf->pos < len) { buf->size += len + SZ; - buf->p = realloc(buf->p, buf->size); + buf->p = NOFAIL(realloc(buf->p, buf->size)); } strncpy(buf->p + buf->pos, s, len); buf->pos += len; From 4ca3b3df6d52aff8b42271a23d7f2218672cfbf8 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Aug 2018 15:46:57 -0700 Subject: [PATCH 0841/1212] mm/fadvise.c: fix signed overflow UBSAN complaint [ Upstream commit a718e28f538441a3b6612da9ff226973376cdf0f ] Signed integer overflow 
is undefined according to the C standard. The overflow in ksys_fadvise64_64() is deliberate, but since it is signed overflow, UBSAN complains: UBSAN: Undefined behaviour in mm/fadvise.c:76:10 signed integer overflow: 4 + 9223372036854775805 cannot be represented in type 'long long int' Use unsigned types to do math. Unsigned overflow is defined so UBSAN will not complain about it. This patch doesn't change generated code. [akpm@linux-foundation.org: add comment explaining the casts] Link: http://lkml.kernel.org/r/20180629184453.7614-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reported-by: Reviewed-by: Andrew Morton Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- mm/fadvise.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/fadvise.c b/mm/fadvise.c index b8a5bc66b0c0..001877e32f0c 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -68,8 +68,12 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) goto out; } - /* Careful about overflows. Len == 0 means "as much as possible" */ - endbyte = offset + len; + /* + * Careful about overflows. Len == 0 means "as much as possible". Use + * unsigned math because signed overflows are undefined and UBSan + * complains. + */ + endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) endbyte = -1; else From 90d91af021986c31f14591e5e343fcf7293adda2 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 17 Aug 2018 15:44:34 -0700 Subject: [PATCH 0842/1212] fs/dcache.c: fix kmemcheck splat at take_dentry_name_snapshot() [ Upstream commit 6cd00a01f0c1ae6a852b09c59b8dd55cc6c35d1d ] Since only dentry->d_name.len + 1 bytes out of DNAME_INLINE_LEN bytes are initialized at __d_alloc(), we can't copy the whole size unconditionally. 
WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff8fa27465ac50) 636f6e66696766732e746d70000000000010000000000000020000000188ffff i i i i i i i i i i i i i u u u u u u u u u u i i i i i u u u u ^ RIP: 0010:take_dentry_name_snapshot+0x28/0x50 RSP: 0018:ffffa83000f5bdf8 EFLAGS: 00010246 RAX: 0000000000000020 RBX: ffff8fa274b20550 RCX: 0000000000000002 RDX: ffffa83000f5be40 RSI: ffff8fa27465ac50 RDI: ffffa83000f5be60 RBP: ffffa83000f5bdf8 R08: ffffa83000f5be48 R09: 0000000000000001 R10: ffff8fa27465ac00 R11: ffff8fa27465acc0 R12: ffff8fa27465ac00 R13: ffff8fa27465acc0 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f79737ac8c0(0000) GS:ffffffff8fc30000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff8fa274c0b000 CR3: 0000000134aa7002 CR4: 00000000000606f0 take_dentry_name_snapshot+0x28/0x50 vfs_rename+0x128/0x870 SyS_rename+0x3b2/0x3d0 entry_SYSCALL_64_fastpath+0x1a/0xa4 0xffffffffffffffff Link: http://lkml.kernel.org/r/201709131912.GBG39012.QMJLOVFSFFOOtH@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Vegard Nossum Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index 807efaab838e..141651b0c766 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -278,7 +278,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry spin_unlock(&dentry->d_lock); name->name = p->name; } else { - memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + memcpy(name->inline_name, dentry->d_iname, + dentry->d_name.len + 1); spin_unlock(&dentry->d_lock); name->name = name->inline_name; } From 4ebf605de623c307db846463892267671db6b58b Mon Sep 17 00:00:00 2001 From: Tan Hu Date: Wed, 25 Jul 2018 15:23:07 +0800 Subject: [PATCH 0843/1212] ipvs: fix race between ip_vs_conn_new() and ip_vs_del_dest() [ Upstream 
commit a53b42c11815d2357e31a9403ae3950517525894 ] We came across infinite loop in ipvs when using ipvs in docker env. When ipvs receives new packets and cannot find an ipvs connection, it will create a new connection, then if the dest is unavailable (i.e. IP_VS_DEST_F_AVAILABLE), the packet will be dropped silently. But if the dropped packet is the first packet of this connection, the connection control timer never has a chance to start and the ipvs connection cannot be released. This will lead to memory leak, or infinite loop in cleanup_net() when net namespace is released like this: ip_vs_conn_net_cleanup at ffffffffa0a9f31a [ip_vs] __ip_vs_cleanup at ffffffffa0a9f60a [ip_vs] ops_exit_list at ffffffff81567a49 cleanup_net at ffffffff81568b40 process_one_work at ffffffff810a851b worker_thread at ffffffff810a9356 kthread at ffffffff810b0b6f ret_from_fork at ffffffff81697a18 race condition: CPU1 CPU2 ip_vs_in() ip_vs_conn_new() ip_vs_del_dest() __ip_vs_unlink_dest() ~IP_VS_DEST_F_AVAILABLE cp->dest && !IP_VS_DEST_F_AVAILABLE __ip_vs_conn_put ... cleanup_net ---> infinite looping Fix this by checking whether the timer already started.
Signed-off-by: Tan Hu Reviewed-by: Jiang Biao Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipvs/ip_vs_core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index dd1649caa2b2..ac212542a217 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1809,13 +1809,20 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ - if (sysctl_expire_nodest_conn(ipvs)) { + __u32 flags = cp->flags; + + /* when timer already started, silently drop the packet.*/ + if (timer_pending(&cp->timer)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); + + if (sysctl_expire_nodest_conn(ipvs) && + !(flags & IP_VS_CONN_F_ONE_PACKET)) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); } - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); + return NF_DROP; } From a429a299eb4836e03d40683d7cc723b07d3ad6bd Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 3 Aug 2018 20:59:51 -0700 Subject: [PATCH 0844/1212] mfd: sm501: Set coherent_dma_mask when creating subdevices [ Upstream commit 2f606da78230f09cf1a71fde6ee91d0c710fa2b2 ] Instantiating the sm501 OHCI subdevice results in a kernel warning. 
sm501-usb sm501-usb: SM501 OHCI sm501-usb sm501-usb: new USB bus registered, assigned bus number 1 WARNING: CPU: 0 PID: 1 at ./include/linux/dma-mapping.h:516 ohci_init+0x194/0x2d8 Modules linked in: CPU: 0 PID: 1 Comm: swapper Tainted: G W 4.18.0-rc7-00178-g0b5b1f9a78b5 #1 PC is at ohci_init+0x194/0x2d8 PR is at ohci_init+0x168/0x2d8 PC : 8c27844c SP : 8f81dd94 SR : 40008001 TEA : 29613060 R0 : 00000000 R1 : 00000000 R2 : 00000000 R3 : 00000202 R4 : 8fa98b88 R5 : 8c277e68 R6 : 00000000 R7 : 00000000 R8 : 8f965814 R9 : 8c388100 R10 : 8fa98800 R11 : 8fa98928 R12 : 8c48302c R13 : 8fa98920 R14 : 8c48302c MACH: 00000096 MACL: 0000017c GBR : 00000000 PR : 8c278420 Call trace: [<(ptrval)>] usb_add_hcd+0x1e8/0x6ec [<(ptrval)>] _dev_info+0x0/0x54 [<(ptrval)>] arch_local_save_flags+0x0/0x8 [<(ptrval)>] arch_local_irq_restore+0x0/0x24 [<(ptrval)>] ohci_hcd_sm501_drv_probe+0x114/0x2d8 ... Initialize coherent_dma_mask when creating SM501 subdevices to fix the problem. Fixes: b6d6454fdb66f ("mfd: SM501 core driver") Signed-off-by: Guenter Roeck Signed-off-by: Lee Jones Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/sm501.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index c646784c5a7d..fbec711c4195 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -714,6 +714,7 @@ sm501_create_subdev(struct sm501_devdata *sm, char *name, smdev->pdev.name = name; smdev->pdev.id = sm->pdev_id; smdev->pdev.dev.parent = sm->dev; + smdev->pdev.dev.coherent_dma_mask = 0xffffffff; if (res_count) { smdev->pdev.resource = (struct resource *)(smdev+1); From ad09041e9335398ab36fcaaf50fa3c4e7b8a29f7 Mon Sep 17 00:00:00 2001 From: Aleh Filipovich Date: Fri, 10 Aug 2018 22:07:25 +0200 Subject: [PATCH 0845/1212] platform/x86: asus-nb-wmi: Add keymap entry for lid flip action on UX360 [ Upstream commit 880b29ac107d15644bf4da228376ba3cd6af6d71 ] Add entry to WMI keymap for lid flip event on Asus UX360. 
On Asus Zenbook ux360 flipping lid from/to tablet mode triggers keyscan code 0xfa which cannot be handled and results in kernel log message "Unknown key fa pressed". Signed-off-by: Aleh Filipovich Signed-off-by: Andy Shevchenko Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/asus-nb-wmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 0e0403e024c5..852d2de7f69f 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -392,6 +392,7 @@ static const struct key_entry asus_nb_wmi_keymap[] = { { KE_KEY, 0xC4, { KEY_KBDILLUMUP } }, { KE_KEY, 0xC5, { KEY_KBDILLUMDOWN } }, { KE_IGNORE, 0xC6, }, /* Ambient Light Sensor notification */ + { KE_KEY, 0xFA, { KEY_PROG2 } }, /* Lid flip action */ { KE_END, 0}, }; From 3537179891cae88b1f115af01b10c57594a31d71 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 9 Aug 2018 10:59:01 +0200 Subject: [PATCH 0846/1212] irqchip/bcm7038-l1: Hide cpu offline callback when building for !SMP [ Upstream commit 0702bc4d2fe793018ad9aa0eb14bff7f526c4095 ] When compiling bmips with SMP disabled, the build fails with: drivers/irqchip/irq-bcm7038-l1.o: In function `bcm7038_l1_cpu_offline': drivers/irqchip/irq-bcm7038-l1.c:242: undefined reference to `irq_set_affinity_locked' make[5]: *** [vmlinux] Error 1 Fix this by adding and setting bcm7038_l1_cpu_offline only when actually compiling for SMP. It wouldn't have been used anyway, as it requires CPU_HOTPLUG, which in turn requires SMP. 
Fixes: 34c535793bcb ("irqchip/bcm7038-l1: Implement irq_cpu_offline() callback") Signed-off-by: Jonas Gorski Signed-off-by: Marc Zyngier Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-bcm7038-l1.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index d7af88534971..6fb34bf0f352 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -216,6 +216,7 @@ static int bcm7038_l1_set_affinity(struct irq_data *d, return 0; } +#ifdef CONFIG_SMP static void bcm7038_l1_cpu_offline(struct irq_data *d) { struct cpumask *mask = irq_data_get_affinity_mask(d); @@ -240,6 +241,7 @@ static void bcm7038_l1_cpu_offline(struct irq_data *d) } irq_set_affinity_locked(d, &new_affinity, false); } +#endif static int __init bcm7038_l1_init_one(struct device_node *dn, unsigned int idx, @@ -292,7 +294,9 @@ static struct irq_chip bcm7038_l1_irq_chip = { .irq_mask = bcm7038_l1_mask, .irq_unmask = bcm7038_l1_unmask, .irq_set_affinity = bcm7038_l1_set_affinity, +#ifdef CONFIG_SMP .irq_cpu_offline = bcm7038_l1_cpu_offline, +#endif }; static int bcm7038_l1_map(struct irq_domain *d, unsigned int virq, From 75c55cbd46b15113f74a31ddfc8fa0930826e371 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 17 Jul 2018 19:14:45 -0700 Subject: [PATCH 0847/1212] net/9p: fix error path of p9_virtio_probe [ Upstream commit 92aef4675d5b1b55404e1532379e343bed0e5cf2 ] Currently when virtio_find_single_vq fails, we go through del_vqs which throws a warning (Trying to free already-free IRQ). Skip del_vqs if vq allocation failed. 
Link: http://lkml.kernel.org/r/20180524101021.49880-1-jean-philippe.brucker@arm.com Signed-off-by: Jean-Philippe Brucker Reviewed-by: Greg Kurz Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Dominique Martinet Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/9p/trans_virtio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 6018a1c0dc28..2a15b6aa9cdd 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -574,7 +574,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); if (IS_ERR(chan->vq)) { err = PTR_ERR(chan->vq); - goto out_free_vq; + goto out_free_chan; } chan->vq->vdev->priv = chan; spin_lock_init(&chan->lock); @@ -627,6 +627,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) kfree(tag); out_free_vq: vdev->config->del_vqs(vdev); +out_free_chan: kfree(chan); fail: return err; From f675ab001e78e6bff87bfd0018175119c3a5796b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Aug 2018 14:57:24 +0300 Subject: [PATCH 0848/1212] powerpc: Fix size calculation using resource_size() [ Upstream commit c42d3be0c06f0c1c416054022aa535c08a1f9b39 ] The problem is that the calculation should be "end - start + 1" but the plus one is missing in this calculation.
Fixes: 8626816e905e ("powerpc: add support for MPIC message register API") Signed-off-by: Dan Carpenter Reviewed-by: Tyrel Datwyler Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/sysdev/mpic_msgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c index 3f165d972a0e..994fe73c2ed0 100644 --- a/arch/powerpc/sysdev/mpic_msgr.c +++ b/arch/powerpc/sysdev/mpic_msgr.c @@ -196,7 +196,7 @@ static int mpic_msgr_probe(struct platform_device *dev) /* IO map the message register block. */ of_address_to_resource(np, 0, &rsrc); - msgr_block_addr = ioremap(rsrc.start, rsrc.end - rsrc.start); + msgr_block_addr = ioremap(rsrc.start, resource_size(&rsrc)); if (!msgr_block_addr) { dev_err(&dev->dev, "Failed to iomap MPIC message registers"); return -EFAULT; From 4057a20078fe3259f6e898ce41968928966f0116 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Wed, 25 Jul 2018 14:00:47 +0200 Subject: [PATCH 0849/1212] s390/dasd: fix hanging offline processing due to canceled worker [ Upstream commit 669f3765b755fd8739ab46ce3a9c6292ce8b3d2a ] During offline processing two worker threads are canceled without freeing the device reference which leads to a hanging offline process. 
Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/block/dasd_eckd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 21d174e9ebdb..dac2f6883e28 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2101,8 +2101,11 @@ static int dasd_eckd_basic_to_ready(struct dasd_device *device) static int dasd_eckd_online_to_ready(struct dasd_device *device) { - cancel_work_sync(&device->reload_device); - cancel_work_sync(&device->kick_validate); + if (cancel_work_sync(&device->reload_device)) + dasd_put_device(device); + if (cancel_work_sync(&device->kick_validate)) + dasd_put_device(device); + return 0; }; From 242343ebf645ca5c89aac3ee9503484177c7bb6a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Aug 2018 17:29:09 +0300 Subject: [PATCH 0850/1212] scsi: aic94xx: fix an error code in aic94xx_init() [ Upstream commit 0756c57bce3d26da2592d834d8910b6887021701 ] We accidentally return success instead of -ENOMEM on this error path. Fixes: 2908d778ab3e ("[SCSI] aic94xx: new driver") Signed-off-by: Dan Carpenter Reviewed-by: Johannes Thumshirn Reviewed-by: John Garry Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/aic94xx/aic94xx_init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c index 662b2321d1b0..913ebb6d0d29 100644 --- a/drivers/scsi/aic94xx/aic94xx_init.c +++ b/drivers/scsi/aic94xx/aic94xx_init.c @@ -1031,8 +1031,10 @@ static int __init aic94xx_init(void) aic94xx_transport_template = sas_domain_attach_transport(&aic94xx_transport_functions); - if (!aic94xx_transport_template) + if (!aic94xx_transport_template) { + err = -ENOMEM; goto out_destroy_caches; + } err = pci_register_driver(&aic94xx_pci_driver); if (err) From d07bbe50d1b7e98fb1afeed62506c28dd1a21700 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 3 Aug 2018 16:38:44 +0200 Subject: [PATCH 0851/1212] PCI: mvebu: Fix I/O space end address calculation [ Upstream commit dfd0309fd7b30a5baffaf47b2fccb88b46d64d69 ] pcie->realio.end should be the address of last byte of the area, therefore using resource_size() of another resource is not correct, we must subtract 1 to get the address of the last byte.
Fixes: 11be65472a427 ("PCI: mvebu: Adapt to the new device tree layout") Signed-off-by: Thomas Petazzoni Signed-off-by: Lorenzo Pieralisi Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/pci/host/pci-mvebu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c index 379d08f76146..d0a4652bb9ac 100644 --- a/drivers/pci/host/pci-mvebu.c +++ b/drivers/pci/host/pci-mvebu.c @@ -1235,7 +1235,7 @@ static int mvebu_pcie_probe(struct platform_device *pdev) pcie->realio.start = PCIBIOS_MIN_IO; pcie->realio.end = min_t(resource_size_t, IO_SPACE_LIMIT, - resource_size(&pcie->io)); + resource_size(&pcie->io) - 1); } else pcie->realio = pcie->io; From 66236f1b06f6d2564fa49247cb4d26612fd6c23f Mon Sep 17 00:00:00 2001 From: John Pittman Date: Mon, 6 Aug 2018 15:53:12 -0400 Subject: [PATCH 0852/1212] dm kcopyd: avoid softlockup in run_complete_job [ Upstream commit 784c9a29e99eb40b842c29ecf1cc3a79e00fb629 ] It was reported that softlockups occur when using dm-snapshot ontop of slow (rbd) storage. E.g.: [ 4047.990647] watchdog: BUG: soft lockup - CPU#10 stuck for 22s! [kworker/10:23:26177] ... [ 4048.034151] Workqueue: kcopyd do_work [dm_mod] [ 4048.034156] RIP: 0010:copy_callback+0x41/0x160 [dm_snapshot] ... [ 4048.034190] Call Trace: [ 4048.034196] ? __chunk_is_tracked+0x70/0x70 [dm_snapshot] [ 4048.034200] run_complete_job+0x5f/0xb0 [dm_mod] [ 4048.034205] process_jobs+0x91/0x220 [dm_mod] [ 4048.034210] ? kcopyd_put_pages+0x40/0x40 [dm_mod] [ 4048.034214] do_work+0x46/0xa0 [dm_mod] [ 4048.034219] process_one_work+0x171/0x370 [ 4048.034221] worker_thread+0x1fc/0x3f0 [ 4048.034224] kthread+0xf8/0x130 [ 4048.034226] ? max_active_store+0x80/0x80 [ 4048.034227] ? 
kthread_bind+0x10/0x10 [ 4048.034231] ret_from_fork+0x35/0x40 [ 4048.034233] Kernel panic - not syncing: softlockup: hung tasks Fix this by calling cond_resched() after run_complete_job()'s callout to the dm_kcopyd_notify_fn (which is dm-snap.c:copy_callback in the above trace). Signed-off-by: John Pittman Signed-off-by: Mike Snitzer Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-kcopyd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 1452ed9aacb4..54c308e6704f 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -454,6 +454,8 @@ static int run_complete_job(struct kcopyd_job *job) if (atomic_dec_and_test(&kc->nr_jobs)) wake_up(&kc->destroyq); + cond_resched(); + return 0; } From 9ad681c4ba53e507a11de6a5ee9af89d6c6de7e2 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 6 Aug 2018 11:05:13 +0100 Subject: [PATCH 0853/1212] staging: comedi: ni_mio_common: fix subdevice flags for PFI subdevice [ Upstream commit e083926b3e269d4064825dcf2ad50c636fddf8cf ] The PFI subdevice flags indicate that the subdevice is readable and writeable, but that is only true for the supported "M-series" boards, not the older "E-series" boards. Only set the SDF_READABLE and SDF_WRITABLE subdevice flags for the M-series boards. These two flags are mainly for informational purposes. 
Signed-off-by: Ian Abbott Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_mio_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c index 8f181caffca3..619c989c5f37 100644 --- a/drivers/staging/comedi/drivers/ni_mio_common.c +++ b/drivers/staging/comedi/drivers/ni_mio_common.c @@ -5275,11 +5275,11 @@ static int ni_E_init(struct comedi_device *dev, /* Digital I/O (PFI) subdevice */ s = &dev->subdevices[NI_PFI_DIO_SUBDEV]; s->type = COMEDI_SUBD_DIO; - s->subdev_flags = SDF_READABLE | SDF_WRITABLE | SDF_INTERNAL; s->maxdata = 1; if (devpriv->is_m_series) { s->n_chan = 16; s->insn_bits = ni_pfi_insn_bits; + s->subdev_flags = SDF_READABLE | SDF_WRITABLE | SDF_INTERNAL; ni_writew(dev, s->state, NI_M_PFI_DO_REG); for (i = 0; i < NUM_PFI_OUTPUT_SELECT_REGS; ++i) { @@ -5288,6 +5288,7 @@ static int ni_E_init(struct comedi_device *dev, } } else { s->n_chan = 10; + s->subdev_flags = SDF_INTERNAL; } s->insn_config = ni_pfi_insn_config; From 8e676abeaf4b0c2dc46b5b47f78e72dcc66a9f9f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 7 Aug 2018 11:15:39 -0300 Subject: [PATCH 0854/1212] selftests/powerpc: Kill child processes on SIGINT [ Upstream commit 7c27a26e1ed5a7dd709aa19685d2c98f64e1cf0c ] There are some powerpc selftests, as tm/tm-unavailable, that run for a long period (>120 seconds), and if it is interrupted, as pressing CTRL-C (SIGINT), the foreground process (harness) dies but the child process and threads continue to execute (with PPID = 1 now) in background. In this case, you'd think the whole test exited, but there are remaining threads and processes being executed in background. Sometimes these zombie processes are doing annoying things, as consuming the whole CPU or dumping things to STDOUT.
This patch fixes this problem by attaching an empty signal handler to SIGINT in the harness process. This handler will interrupt (EINTR) the parent process waitpid() call, letting the code to follow through the normal flow, which will kill all the processes in the child process group. This patch also fixes a typo. Signed-off-by: Breno Leitao Signed-off-by: Gustavo Romero Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/powerpc/harness.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/powerpc/harness.c b/tools/testing/selftests/powerpc/harness.c index f45cee80c58b..af2b1e66e35e 100644 --- a/tools/testing/selftests/powerpc/harness.c +++ b/tools/testing/selftests/powerpc/harness.c @@ -85,13 +85,13 @@ int run_test(int (test_function)(void), char *name) return status; } -static void alarm_handler(int signum) +static void sig_handler(int signum) { - /* Jut wake us up from waitpid */ + /* Just wake us up from waitpid */ } -static struct sigaction alarm_action = { - .sa_handler = alarm_handler, +static struct sigaction sig_action = { + .sa_handler = sig_handler, }; int test_harness(int (test_function)(void), char *name) @@ -101,8 +101,14 @@ int test_harness(int (test_function)(void), char *name) test_start(name); test_set_git_version(GIT_VERSION); - if (sigaction(SIGALRM, &alarm_action, NULL)) { - perror("sigaction"); + if (sigaction(SIGINT, &sig_action, NULL)) { + perror("sigaction (sigint)"); + test_error(name); + return 1; + } + + if (sigaction(SIGALRM, &sig_action, NULL)) { + perror("sigaction (sigalrm)"); test_error(name); return 1; } From d6773f4061ceb8a4dbc655c386b2531b2db44b04 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 1 Aug 2018 00:56:12 -0500 Subject: [PATCH 0855/1212] smb3: fix reset of bytes read and written stats [ Upstream commit c281bc0c7412308c7ec0888904f7c99353da4796 ] echo 0 > /proc/fs/cifs/Stats is supposed to 
reset the stats but there were four (see example below) that were not reset (bytes read and written, total vfs ops and max ops at one time). ... 0 session 0 share reconnects Total vfs operations: 100 maximum at one time: 2 1) \\localhost\test SMBs: 0 Bytes read: 502092 Bytes written: 31457286 TreeConnects: 0 total 0 failed TreeDisconnects: 0 total 0 failed ... This patch fixes cifs_stats_proc_write to properly reset those four. Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifs_debug.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index bcbe42fb7e92..0e72a14228f8 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -285,6 +285,10 @@ static ssize_t cifs_stats_proc_write(struct file *file, atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); #endif /* CONFIG_CIFS_STATS2 */ + spin_lock(&GlobalMid_Lock); + GlobalMaxActiveXid = 0; + GlobalCurrentXid = 0; + spin_unlock(&GlobalMid_Lock); spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, @@ -297,6 +301,10 @@ static ssize_t cifs_stats_proc_write(struct file *file, struct cifs_tcon, tcon_list); atomic_set(&tcon->num_smbs_sent, 0); + spin_lock(&tcon->stat_lock); + tcon->bytes_read = 0; + tcon->bytes_written = 0; + spin_unlock(&tcon->stat_lock); if (server->ops->clear_stats) server->ops->clear_stats(tcon); } From a9997f8873511a1b50d1b1afdb645fe0f23e2b65 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 23 Jul 2018 09:15:18 -0500 Subject: [PATCH 0856/1212] SMB3: Number of requests sent should be displayed for SMB3 not just CIFS [ Upstream commit 289131e1f1e6ad8c661ec05e176b8f0915672059 ] For SMB2/SMB3 the number of requests sent was not displayed in /proc/fs/cifs/Stats unless CONFIG_CIFS_STATS2 was enabled (only number of failed requests displayed).
As with earlier dialects, we should be displaying these counters if CONFIG_CIFS_STATS is enabled. They are important for debugging. e.g. when you cat /proc/fs/cifs/Stats (before the patch) Resources in use CIFS Session: 1 Share (unique mount targets): 2 SMB Request/Response Buffer: 1 Pool size: 5 SMB Small Req/Resp Buffer: 1 Pool size: 30 Operations (MIDs): 0 0 session 0 share reconnects Total vfs operations: 690 maximum at one time: 2 1) \\localhost\test SMBs: 975 Negotiates: 0 sent 0 failed SessionSetups: 0 sent 0 failed Logoffs: 0 sent 0 failed TreeConnects: 0 sent 0 failed TreeDisconnects: 0 sent 0 failed Creates: 0 sent 2 failed Closes: 0 sent 0 failed Flushes: 0 sent 0 failed Reads: 0 sent 0 failed Writes: 0 sent 0 failed Locks: 0 sent 0 failed IOCTLs: 0 sent 1 failed Cancels: 0 sent 0 failed Echos: 0 sent 0 failed QueryDirectories: 0 sent 63 failed Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Reviewed-by: Pavel Shilovsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5f5ba807b414..52d79fb04115 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -315,7 +315,7 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, smb2_hdr_assemble((struct smb2_hdr *) *request_buf, smb2_command, tcon); if (tcon != NULL) { -#ifdef CONFIG_CIFS_STATS2 +#ifdef CONFIG_CIFS_STATS uint16_t com_code = le16_to_cpu(smb2_command); cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]); #endif From 2dc310f837540defcae83edde8bad940e12abcd1 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Wed, 4 Jul 2018 23:27:02 +0530 Subject: [PATCH 0857/1212] powerpc/pseries: Avoid using the size greater than RTAS_ERROR_LOG_MAX. [ Upstream commit 74e96bf44f430cf7a01de19ba6cf49b361cdfd6e ] The global mce data buffer that used to copy rtas error log is of 2048 (RTAS_ERROR_LOG_MAX) bytes in size. 
Before the copy we read extended_log_length from rtas error log header, then use max of extended_log_length and RTAS_ERROR_LOG_MAX as a size of data to be copied. Ideally the platform (phyp) will never send extended error log with size > 2048. But if that happens, then we have a risk of buffer overrun and corruption. Fix this by using min_t instead. Fixes: d368514c3097 ("powerpc: Fix corruption when grabbing FWNMI data") Reported-by: Michal Suchanek Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f5313a78e5d6..9795e52bab3d 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -311,7 +311,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) int len, error_log_length; error_log_length = 8 + rtas_error_extended_log_length(h); - len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX); + len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); memcpy(global_mce_data_buf, h, len); errhdr = (struct rtas_error_log *)global_mce_data_buf; From accb3e424b8b6b9fa51b77fcf7e6673f40d182a6 Mon Sep 17 00:00:00 2001 From: Misono Tomohiro Date: Tue, 31 Jul 2018 16:20:21 +0900 Subject: [PATCH 0858/1212] btrfs: replace: Reset on-disk dev stats value after replace [ Upstream commit 1e7e1f9e3aba00c9b9c323bfeeddafe69ff21ff6 ] on-disk devs stats value is updated in btrfs_run_dev_stats(), which is called during commit transaction, if device->dev_stats_ccnt is not zero. Since current replace operation does not touch dev_stats_ccnt, on-disk dev stats value is not updated. Therefore "btrfs device stats" may return old device's value after umount/mount (Example: See "btrfs ins dump-t -t DEV $DEV" after btrfs/100 finish). 
Fix this by just incrementing dev_stats_ccnt in btrfs_dev_replace_finishing() when replace is succeeded and this will update the values. Signed-off-by: Misono Tomohiro Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/dev-replace.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7dd4c..176a27bc63aa 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -573,6 +573,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_rm_dev_replace_unblocked(fs_info); + /* + * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will + * update on-disk dev stats value during commit transaction + */ + atomic_inc(&tgt_device->dev_stats_ccnt); + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the From 510825b3f8c1f5dc29b81660e1eb68e7fb0b8d50 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 3 Jul 2018 17:10:07 +0800 Subject: [PATCH 0859/1212] btrfs: relocation: Only remove reloc rb_trees if reloc control has been initialized [ Upstream commit 389305b2aa68723c754f88d9dbd268a400e10664 ] Invalid reloc tree can cause kernel NULL pointer dereference when btrfs does some cleanup of the reloc roots. It turns out that fs_info::reloc_ctl can be NULL in btrfs_recover_relocation() as we allocate relocation control after all reloc roots have been verified. So when we hit: note, we haven't called set_reloc_control() thus fs_info::reloc_ctl is still NULL. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=199833 Reported-by: Xu Wen Signed-off-by: Qu Wenruo Tested-by: Gu Jinxiang Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/relocation.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9ebe027cc4b7..cfe913d2d3df 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1318,18 +1318,19 @@ static void __del_reloc_root(struct btrfs_root *root) struct mapping_node *node = NULL; struct reloc_control *rc = root->fs_info->reloc_ctl; - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->node->start); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + if (rc) { + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + if (!node) + return; + BUG_ON((struct btrfs_root *)node->data != root); } - spin_unlock(&rc->reloc_root_tree.lock); - - if (!node) - return; - BUG_ON((struct btrfs_root *)node->data != root); spin_lock(&root->fs_info->trans_lock); list_del_init(&root->root_list); From 02e48c4d57ccaa89ad1d9fbf39ae6a4bb20aa4e5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 22 Jun 2018 12:35:00 +0800 Subject: [PATCH 0860/1212] btrfs: Don't remove block group that still has pinned down bytes [ Upstream commit 43794446548730ac8461be30bbe47d5d027d1d16 ] [BUG] Under certain KVM load and LTP tests, it is possible to hit the following calltrace if quota is enabled: BTRFS critical (device vda2): unable to find logical 8820195328 length 4096 BTRFS critical (device vda2): unable to find 
logical 8820195328 length 4096 WARNING: CPU: 0 PID: 49 at ../block/blk-core.c:172 blk_status_to_errno+0x1a/0x30 CPU: 0 PID: 49 Comm: kworker/u2:1 Not tainted 4.12.14-15-default #1 SLE15 (unreleased) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] task: ffff9f827b340bc0 task.stack: ffffb4f8c0304000 RIP: 0010:blk_status_to_errno+0x1a/0x30 Call Trace: submit_extent_page+0x191/0x270 [btrfs] ? btrfs_create_repair_bio+0x130/0x130 [btrfs] __do_readpage+0x2d2/0x810 [btrfs] ? btrfs_create_repair_bio+0x130/0x130 [btrfs] ? run_one_async_done+0xc0/0xc0 [btrfs] __extent_read_full_page+0xe7/0x100 [btrfs] ? run_one_async_done+0xc0/0xc0 [btrfs] read_extent_buffer_pages+0x1ab/0x2d0 [btrfs] ? run_one_async_done+0xc0/0xc0 [btrfs] btree_read_extent_buffer_pages+0x94/0xf0 [btrfs] read_tree_block+0x31/0x60 [btrfs] read_block_for_search.isra.35+0xf0/0x2e0 [btrfs] btrfs_search_slot+0x46b/0xa00 [btrfs] ? kmem_cache_alloc+0x1a8/0x510 ? btrfs_get_token_32+0x5b/0x120 [btrfs] find_parent_nodes+0x11d/0xeb0 [btrfs] ? leaf_space_used+0xb8/0xd0 [btrfs] ? btrfs_leaf_free_space+0x49/0x90 [btrfs] ? btrfs_find_all_roots_safe+0x93/0x100 [btrfs] btrfs_find_all_roots_safe+0x93/0x100 [btrfs] btrfs_find_all_roots+0x45/0x60 [btrfs] btrfs_qgroup_trace_extent_post+0x20/0x40 [btrfs] btrfs_add_delayed_data_ref+0x1a3/0x1d0 [btrfs] btrfs_alloc_reserved_file_extent+0x38/0x40 [btrfs] insert_reserved_file_extent.constprop.71+0x289/0x2e0 [btrfs] btrfs_finish_ordered_io+0x2f4/0x7f0 [btrfs] ? pick_next_task_fair+0x2cd/0x530 ? __switch_to+0x92/0x4b0 btrfs_worker_helper+0x81/0x300 [btrfs] process_one_work+0x1da/0x3f0 worker_thread+0x2b/0x3f0 ? process_one_work+0x3f0/0x3f0 kthread+0x11a/0x130 ? 
kthread_create_on_node+0x40/0x40 ret_from_fork+0x35/0x40 BTRFS critical (device vda2): unable to find logical 8820195328 length 16384 BTRFS: error (device vda2) in btrfs_finish_ordered_io:3023: errno=-5 IO failure BTRFS info (device vda2): forced readonly BTRFS error (device vda2): pending csums is 2887680 [CAUSE] It's caused by race with block group auto removal: - There is a meta block group X, which has only one tree block The tree block belongs to fs tree 257. - In current transaction, some operation modified fs tree 257 The tree block gets COWed, so the block group X is empty, and marked as unused, queued to be deleted. - Some workload (like fsync) wakes up cleaner_kthread() Which will call btrfs_delete_unused_bgs() to remove unused block groups. So block group X along its chunk map get removed. - Some delalloc work finished for fs tree 257 Quota needs to get the original reference of the extent, which will read tree blocks of commit root of 257. Then since the chunk map gets removed, the above warning gets triggered. [FIX] Just let btrfs_delete_unused_bgs() skip block group which still has pinned bytes. However there is a minor side effect: currently we only queue empty blocks at update_block_group(), and such empty block group with pinned bytes won't go through update_block_group() again, such block group won't be removed, until it gets new extent allocated and removed. 
Signed-off-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 493c7354ec0b..a72f941ca750 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10410,7 +10410,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); - if (block_group->reserved || + if (block_group->reserved || block_group->pinned || btrfs_block_group_used(&block_group->item) || block_group->ro || list_is_singular(&block_group->list)) { From 98d122a4a74667ffc16d50baa086e9616fb44f28 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 23 Jul 2018 14:25:31 -0700 Subject: [PATCH 0861/1212] debugobjects: Make stack check warning more informative commit fc91a3c4c27acdca0bc13af6fbb68c35cfd519f2 upstream. While debugging an issue debugobject tracking warned about an annotation issue of an object on stack. It turned out that the issue was due to the object in concern being on a different stack which was due to another issue. Thomas suggested to print the pointers and the location of the stack for the currently running task. This helped to figure out that the object was on the wrong stack. As this is general useful information for debugging similar issues, make the error message more informative by printing the pointers. 
[ tglx: Massaged changelog ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Thomas Gleixner Acked-by: Waiman Long Acked-by: Yang Shi Cc: kernel-team@android.com Cc: Arnd Bergmann Cc: astrachan@google.com Link: https://lkml.kernel.org/r/20180723212531.202328-1-joel@joelfernandes.org Signed-off-by: Greg Kroah-Hartman --- lib/debugobjects.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 547f7f923dbc..a26328ec39f1 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -295,9 +295,12 @@ static void debug_object_is_on_stack(void *addr, int onstack) limit++; if (is_on_stack) - pr_warn("object is on stack, but not annotated\n"); + pr_warn("object %p is on stack %p, but NOT annotated.\n", addr, + task_stack_page(current)); else - pr_warn("object is not on stack, but annotated\n"); + pr_warn("object %p is NOT on stack %p, but annotated.\n", addr, + task_stack_page(current)); + WARN_ON(1); } From f46d2b99a6acd87d56822c600fd2587a37e4d56c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 21 Aug 2018 17:37:55 +0200 Subject: [PATCH 0862/1212] x86/pae: use 64 bit atomic xchg function in native_ptep_get_and_clear commit b2d7a075a1ccef2fb321d595802190c8e9b39004 upstream. Using only 32-bit writes for the pte will result in an intermediate L1TF vulnerable PTE. When running as a Xen PV guest this will at once switch the guest to shadow mode resulting in a loss of performance. Use arch_atomic64_xchg() instead which will perform the requested operation atomically with all 64 bits. Some performance considerations according to: https://software.intel.com/sites/default/files/managed/ad/dc/Intel-Xeon-Scalable-Processor-throughput-latency.pdf The main number should be the latency, as there is no tight loop around native_ptep_get_and_clear(). "lock cmpxchg8b" has a latency of 20 cycles, while "lock xchg" (with a memory operand) isn't mentioned in that document. 
"lock xadd" (with xadd having 3 cycles less latency than xchg) has a latency of 11, so we can assume a latency of 14 for "lock xchg". Signed-off-by: Juergen Gross Reviewed-by: Thomas Gleixner Reviewed-by: Jan Beulich Tested-by: Jason Andryuk Signed-off-by: Boris Ostrovsky [ Atomic operations gained an arch_ prefix in 8bf705d13039 ("locking/atomic/x86: Switch atomic.h to use atomic-instrumented.h") so s/arch_atomic64_xchg/atomic64_xchg/ for backport.] Signed-off-by: Jason Andryuk Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgtable-3level.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 5c686382d84b..095dbc25122a 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_3LEVEL_H #define _ASM_X86_PGTABLE_3LEVEL_H +#include + /* * Intel Physical Address Extension (PAE) Mode - three-level page * tables on PPro+ CPUs. @@ -142,10 +144,7 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) { pte_t res; - /* xchg acts as a barrier before the setting of the high bits */ - res.pte_low = xchg(&ptep->pte_low, 0); - res.pte_high = ptep->pte_high; - ptep->pte_high = 0; + res.pte = (pteval_t)atomic64_xchg((atomic64_t *)ptep, 0); return res; } From accf294af41897950d1c83318c44a032d755188d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 28 Aug 2018 12:59:10 -0700 Subject: [PATCH 0863/1212] kbuild: make missing $DEPMOD a Warning instead of an Error commit 914b087ff9e0e9a399a4927fa30793064afc0178 upstream. When $DEPMOD is not found, only print a warning instead of exiting with an error message and error status: Warning: 'make modules_install' requires /sbin/depmod. Please install it. This is probably in the kmod package. 
Change the Error to a Warning because "not all build hosts for cross compiling Linux are Linux systems and are able to provide a working port of depmod, especially at the file patch /sbin/depmod." I.e., "make modules_install" may be used to copy/install the loadable modules files to a target directory on a build system and then transferred to an embedded device where /sbin/depmod is run instead of it being run on the build system. Fixes: 934193a654c1 ("kbuild: verify that $DEPMOD is installed") Signed-off-by: Randy Dunlap Reported-by: H. Nikolaus Schaller Cc: stable@vger.kernel.org Cc: Lucas De Marchi Cc: Lucas De Marchi Cc: Michal Marek Cc: Jessica Yu Cc: Chih-Wei Huang Signed-off-by: Masahiro Yamada Signed-off-by: Maxim Zhukov Signed-off-by: Greg Kroah-Hartman --- scripts/depmod.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/depmod.sh b/scripts/depmod.sh index ea1e96921e3b..baedaef53ca0 100755 --- a/scripts/depmod.sh +++ b/scripts/depmod.sh @@ -15,9 +15,9 @@ if ! test -r System.map ; then fi if [ -z $(command -v $DEPMOD) ]; then - echo "'make modules_install' requires $DEPMOD. Please install it." >&2 + echo "Warning: 'make modules_install' requires $DEPMOD. Please install it." >&2 echo "This is probably in the kmod package." >&2 - exit 1 + exit 0 fi # older versions of depmod don't support -P From 4a7811bb3ae10d76d9e76c2b0ce7b27bc02a9370 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 4 Sep 2018 15:24:04 +0000 Subject: [PATCH 0864/1212] irda: Fix memory leak caused by repeated binds of irda socket The irda_bind() function allocates memory for self->ias_obj without checking to see if the socket is already bound. A userspace process could repeatedly bind the socket, have each new object added into the LM-IAS database, and lose the reference to the old object assigned to the socket to exhaust memory resources. This patch errors out of the bind operation when self->ias_obj is already assigned. 
CVE-2018-6554 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Tyler Hicks Reviewed-by: Seth Arnold Reviewed-by: Stefan Bader Signed-off-by: Greg Kroah-Hartman --- net/irda/af_irda.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 4a116d766c15..82e632b2c5a1 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -774,6 +774,13 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; lock_sock(sk); + + /* Ensure that the socket is not already bound */ + if (self->ias_obj) { + err = -EINVAL; + goto out; + } + #ifdef CONFIG_IRDA_ULTRA /* Special care for Ultra sockets */ if ((sk->sk_type == SOCK_DGRAM) && From 131a3b82c853483b1809cad06f8997421dd49500 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 4 Sep 2018 15:24:05 +0000 Subject: [PATCH 0865/1212] irda: Only insert new objects into the global database via setsockopt The irda_setsockopt() function conditionally allocates memory for a new self->ias_object or, in some cases, reuses the existing self->ias_object. Existing objects were incorrectly reinserted into the LM_IAS database which corrupted the doubly linked list used for the hashbin implementation of the LM_IAS database. When combined with a memory leak in irda_bind(), this issue could be leveraged to create a use-after-free vulnerability in the hashbin list. This patch fixes the issue by only inserting newly allocated objects into the database. 
CVE-2018-6555 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Tyler Hicks Reviewed-by: Seth Arnold Reviewed-by: Stefan Bader Signed-off-by: Greg Kroah-Hartman --- net/irda/af_irda.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 82e632b2c5a1..7cc9db38e1b6 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2027,7 +2027,11 @@ static int irda_setsockopt(struct socket *sock, int level, int optname, err = -EINVAL; goto out; } - irias_insert_object(ias_obj); + + /* Only insert newly allocated objects */ + if (free_ias) + irias_insert_object(ias_obj); + kfree(ias_opt); break; case IRLMP_IAS_DEL: From a37c70426caa22b59e82c598e9795bead06fd9a9 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 3 Sep 2018 10:39:48 -0300 Subject: [PATCH 0866/1212] Revert "ARM: imx_v6_v7_defconfig: Select ULPI support" This reverts commit 0d0af17ae83d6feb29d676c72423461419df5110. This commit causes reboot to fail on imx6 wandboard, so let's revert it. 
Cc: #4.4 Reported-by: Rasmus Villemoes Signed-off-by: Fabio Estevam Signed-off-by: Greg Kroah-Hartman --- arch/arm/configs/imx_v6_v7_defconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index b3490c1c49d1..4187f69f6630 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -261,7 +261,6 @@ CONFIG_USB_STORAGE=y CONFIG_USB_CHIPIDEA=y CONFIG_USB_CHIPIDEA_UDC=y CONFIG_USB_CHIPIDEA_HOST=y -CONFIG_USB_CHIPIDEA_ULPI=y CONFIG_USB_SERIAL=m CONFIG_USB_SERIAL_GENERIC=y CONFIG_USB_SERIAL_FTDI_SIO=m @@ -288,7 +287,6 @@ CONFIG_USB_G_NCM=m CONFIG_USB_GADGETFS=m CONFIG_USB_MASS_STORAGE=m CONFIG_USB_G_SERIAL=m -CONFIG_USB_ULPI_BUS=y CONFIG_MMC=y CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y From a50422747502a5f3b925a96f36f85e2412e798ec Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Mon, 30 Jul 2018 09:56:54 -0700 Subject: [PATCH 0867/1212] enic: do not call enic_change_mtu in enic_probe commit cb5c6568867325f9905e80c96531d963bec8e5ea upstream. In commit ab123fe071c9 ("enic: handle mtu change for vf properly") ASSERT_RTNL() is added to _enic_change_mtu() to prevent it from being called without rtnl held. enic_probe() calls enic_change_mtu() without rtnl held. At this point netdev is not registered yet. Remove call to enic_change_mtu and assign the mtu to netdev->mtu. Fixes: ab123fe071c9 ("enic: handle mtu change for vf properly") Signed-off-by: Govindarajulu Varadarajan Signed-off-by: David S. 
Miller Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cisco/enic/enic_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index b20bce2c7da1..0433fdebda25 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -2683,7 +2683,6 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ enic->port_mtu = enic->config.mtu; - (void)enic_change_mtu(netdev, enic->port_mtu); err = enic_set_mac_addr(netdev, enic->mac_addr); if (err) { @@ -2732,6 +2731,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->features |= NETIF_F_HIGHDMA; netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->mtu = enic->port_mtu; err = register_netdev(netdev); if (err) { From e72977e87482759dba7181e0ec210c2db00c6124 Mon Sep 17 00:00:00 2001 From: Chas Williams Date: Thu, 6 Sep 2018 11:11:27 -0400 Subject: [PATCH 0868/1212] Fixes: Commit cdbf92675fad ("mm: numa: avoid waiting on freed migrated pages") Commit cdbf92675fad ("mm: numa: avoid waiting on freed migrated pages") was an incomplete backport of the upstream commit. It is necessary to always reset page_nid before attempting any early exit. The original commit conflicted due to lack of commit 82b0f8c39a38 ("mm: join struct fault_env and vm_fault") in 4.9 so it wasn't a clean application, and the change must have just gotten lost in the noise. 
Signed-off-by: Chas Williams Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0127b788272f..c4ea57ee2fd1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1393,12 +1393,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { + page_nid = -1; if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(ptl); wait_on_page_locked(page); put_page(page); - page_nid = -1; goto out; } From 27e83f7dd91dee0d7d4a8289db3de4c12f0436bd Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 4 Nov 2015 18:32:37 +0000 Subject: [PATCH 0869/1212] genirq: Delay incrementing interrupt count if it's disabled/pending commit a946e8c717f9355d1abd5408ed0adc0002d1aed1 upstream. In case of a wakeup interrupt, irq_pm_check_wakeup disables the interrupt and marks it pending and suspended, disables it and notifies the pm core about the wake event. The interrupt gets handled later once the system is resumed. However the irq stats is updated twice: once when it's disabled waiting for the system to resume and later when it's handled, resulting in wrong counting of the wakeup interrupt when waking up the system. This patch updates the interrupt count so that it's updated only when the interrupt gets handled. It's already handled correctly in handle_edge_irq and handle_edge_eoi_irq. 
Reported-by: Manoil Claudiu Signed-off-by: Sudeep Holla Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1446661957-1019-1-git-send-email-sudeep.holla@arm.com Signed-off-by: Thomas Gleixner Signed-off-by: Hanjun Guo Signed-off-by: Greg Kroah-Hartman --- kernel/irq/chip.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e4453d9f788c..3c74e13a95dc 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: @@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); @@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; } + kstat_incr_irqs_this_cpu(desc); if (desc->istate & 
IRQS_ONESHOT) mask_irq(desc); From ad37cd79428ee7206be05edba64de6aa3397587e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Jan 2016 14:24:15 +0000 Subject: [PATCH 0870/1212] irqchip/gic-v3-its: Recompute the number of pages on page size change commit 18aa60ce2751c95d3412ed06a58b8b6cfb6f88f2 upstream. When the programming of a GITS_BASERn register fails because of an unsupported ITS page size, we retry it with a smaller page size. Unfortunately, we don't recompute the number of allocated ITS pages, indicating the wrong value computed in the original allocation. A convenient fix is to free the pages we allocated, update the page size, and restart the allocation. This will ensure that we always allocate the right amount in the case of a device table, specially if we have to reduce the allocation order to stay within the boundaries of the ITS maximum allocation. Reported-and-tested-by: Ma Jun Signed-off-by: Marc Zyngier Cc: linux-arm-kernel@lists.infradead.org Cc: Jason Cooper Link: http://lkml.kernel.org/r/1453818255-1289-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner Signed-off-by: Hanjun Guo Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-gic-v3-its.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 82e00e3ad0e0..af61a2f226ca 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -884,6 +884,7 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) } alloc_size = (1 << order) * PAGE_SIZE; +retry_alloc_baser: alloc_pages = (alloc_size / psz); if (alloc_pages > GITS_BASER_PAGES_MAX) { alloc_pages = GITS_BASER_PAGES_MAX; @@ -947,13 +948,16 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) * size and retry. If we reach 4K, then * something is horribly wrong... 
*/ + free_pages((unsigned long)base, order); + its->tables[i] = NULL; + switch (psz) { case SZ_16K: psz = SZ_4K; - goto retry_baser; + goto retry_alloc_baser; case SZ_64K: psz = SZ_16K; - goto retry_baser; + goto retry_alloc_baser; } } From 4a0c7f6afea7d76ce53ca1f9517c2f9ac9ea13ea Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 1 Feb 2016 20:19:44 -0600 Subject: [PATCH 0871/1212] irqchip/gicv3-its: Fix memory leak in its_free_tables() commit 1a485f4d2e28efd77075b2952926683d6c245633 upstream. The current ITS driver has a memory leak in its_free_tables(). It happens on tear down path of the driver when its_probe() call fails. its_free_tables() should free the exact number of pages that have been allocated, not just a single page as current code does. This patch records the memory size for each ITS_BASERn at the time of page allocation and uses the same size information when freeing pages to fix the issue. Signed-off-by: Shanker Donthineni Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Vikram Sethi Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1454379584-21772-1-git-send-email-shankerd@codeaurora.org Signed-off-by: Thomas Gleixner Signed-off-by: Hanjun Guo Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-gic-v3-its.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index af61a2f226ca..f0cbb7631a81 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -67,7 +67,10 @@ struct its_node { unsigned long phys_base; struct its_cmd_block *cmd_base; struct its_cmd_block *cmd_write; - void *tables[GITS_BASER_NR_REGS]; + struct { + void *base; + u32 order; + } tables[GITS_BASER_NR_REGS]; struct its_collection *collections; struct list_head its_device_list; u64 flags; @@ -816,9 +819,10 @@ static void its_free_tables(struct its_node *its) int i; for (i = 0; i < GITS_BASER_NR_REGS; i++) { - if 
(its->tables[i]) { - free_page((unsigned long)its->tables[i]); - its->tables[i] = NULL; + if (its->tables[i].base) { + free_pages((unsigned long)its->tables[i].base, + its->tables[i].order); + its->tables[i].base = NULL; } } } @@ -899,7 +903,8 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) goto out_free; } - its->tables[i] = base; + its->tables[i].base = base; + its->tables[i].order = order; retry_baser: val = (virt_to_phys(base) | @@ -949,7 +954,7 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) * something is horribly wrong... */ free_pages((unsigned long)base, order); - its->tables[i] = NULL; + its->tables[i].base = NULL; switch (psz) { case SZ_16K: From 5e56ddc78f0e3669205ca50acee64fc6c75887b0 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 16 Feb 2016 18:00:36 -0600 Subject: [PATCH 0872/1212] irqchip/gicv3-its: Avoid cache flush beyond ITS_BASERn memory size commit 2eca0d6ceea1f108b2d3ac81fb34698c4fd41006 upstream. Function its_alloc_tables() maintains two local variables, "order" and "alloc_size", to hold memory size that has been allocated to ITS_BASEn. We don't always refresh the variable alloc_size whenever value of the variable order changes, causing the following two problems. - Cache flush operation with size more than required. - Information reported by pr_info is not correct. Use a helper macro that converts page order to size in bytes instead of variable "alloc_size" to fix both the problems.
Signed-off-by: Shanker Donthineni Signed-off-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-gic-v3-its.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index f0cbb7631a81..c3d7a1461043 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -80,6 +80,9 @@ struct its_node { #define ITS_ITT_ALIGN SZ_256 +/* Convert page order to size in bytes */ +#define PAGE_ORDER_TO_SIZE(o) (PAGE_SIZE << (o)) + struct event_lpi_map { unsigned long *lpi_map; u16 *col_map; @@ -855,7 +858,6 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) u64 type = GITS_BASER_TYPE(val); u64 entry_size = GITS_BASER_ENTRY_SIZE(val); int order = get_order(psz); - int alloc_size; int alloc_pages; u64 tmp; void *base; @@ -887,9 +889,8 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) } } - alloc_size = (1 << order) * PAGE_SIZE; retry_alloc_baser: - alloc_pages = (alloc_size / psz); + alloc_pages = (PAGE_ORDER_TO_SIZE(order) / psz); if (alloc_pages > GITS_BASER_PAGES_MAX) { alloc_pages = GITS_BASER_PAGES_MAX; order = get_order(GITS_BASER_PAGES_MAX * psz); @@ -942,7 +943,7 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) shr = tmp & GITS_BASER_SHAREABILITY_MASK; if (!shr) { cache = GITS_BASER_nC; - __flush_dcache_area(base, alloc_size); + __flush_dcache_area(base, PAGE_ORDER_TO_SIZE(order)); } goto retry_baser; } @@ -975,7 +976,7 @@ static int its_alloc_tables(const char *node_name, struct its_node *its) } pr_info("ITS: allocated %d %s @%lx (psz %dK, shr %d)\n", - (int)(alloc_size / entry_size), + (int)(PAGE_ORDER_TO_SIZE(order) / entry_size), its_base_type_string[type], (unsigned long)virt_to_phys(base), psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); From 3107eb31aba1367111d0243fa376536dd451c6d6 Mon Sep 17 00:00:00 2001 From: Marc 
Zyngier Date: Thu, 18 Feb 2016 19:15:45 +0000 Subject: [PATCH 0873/1212] irqchip/gic-v3: Add missing barrier to 32bit version of gic_read_iar() commit 8f318526a292c5e7cebb82f3f766b83c22343293 upstream. Commit 1a1ebd5 ("irqchip/gic-v3: Make sure read from ICC_IAR1_EL1 is visible on redestributor") fixed the missing barrier on arm64, but forgot to update the 32bit counterpart, which has the same requirements. Let's fix it. Fixes: 1a1ebd5 ("irqchip/gic-v3: Make sure read from ICC_IAR1_EL1 is visible on redestributor") Signed-off-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Greg Kroah-Hartman --- arch/arm/include/asm/arch_gicv3.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 7da5503c0591..e08d15184056 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -117,6 +117,7 @@ static inline u32 gic_read_iar(void) u32 irqstat; asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat)); + dsb(sy); return irqstat; } From eadbe44f3978fcb2ac7cd98e91ce2d986a7ae1b8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 16 Dec 2015 14:11:22 +0000 Subject: [PATCH 0874/1212] irqchip/gic: Make interrupt ID 1020 invalid commit 327ebe1f3a9b7e20e298b39d0cff627169a28012 upstream. The GIC has no such thing as interrupt 1020: the last valid ID is 1019, and the range 1020-1023 is reserved - 1023 indicating that no interrupt is pending. So let's make sure we don't try to handle this ID. This bug has been in since the initial GIC code was introduced in 8ad68bbf7a06 ("[ARM] Add support for ARM RealView board"). 
Reported-by: Eric Auger Cc: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-gic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index cebd8efe651a..5fe968a4338a 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -336,7 +336,7 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) irqstat = readl_relaxed(cpu_base + GIC_CPU_INTACK); irqnr = irqstat & GICC_IAR_INT_ID_MASK; - if (likely(irqnr > 15 && irqnr < 1021)) { + if (likely(irqnr > 15 && irqnr < 1020)) { if (static_key_true(&supports_deactivate)) writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI); handle_domain_irq(gic->domain, irqnr, regs); From 6586f61ab5bb8f216d79c40a9997920c1ed01544 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 Mar 2016 17:31:46 +0100 Subject: [PATCH 0875/1212] ovl: rename is_merge to is_lowest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 56656e960b555cb98bc414382566dcb59aae99a2 upstream. The 'is_merge' is an historical naming from when only a single lower layer could exist. With the introduction of multiple lower layers the meaning of this flag was changed to mean only the "lowest layer" (while all lower layers were being merged). 
So now 'is_merge' is inaccurate and hence renaming to 'is_lowest' Signed-off-by: Miklos Szeredi Signed-off-by: SZ Lin (林上智) Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/readdir.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0c59955c4653..42f2612bfd98 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -36,7 +36,7 @@ struct ovl_dir_cache { struct ovl_readdir_data { struct dir_context ctx; - bool is_merge; + bool is_lowest; struct rb_root root; struct list_head *list; struct list_head middle; @@ -140,9 +140,9 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, return 0; } -static int ovl_fill_lower(struct ovl_readdir_data *rdd, - const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int ovl_fill_lowest(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; @@ -194,10 +194,10 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; - if (!rdd->is_merge) + if (!rdd->is_lowest) return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); else - return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); + return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); } static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) @@ -290,7 +290,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) .ctx.actor = ovl_fill_merge, .list = list, .root = RB_ROOT, - .is_merge = false, + .is_lowest = false, }; int idx, next; @@ -307,7 +307,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) * allows offsets to be reasonably constant */ list_add(&rdd.middle, rdd.list); - rdd.is_merge = true; + rdd.is_lowest = true; err = ovl_dir_read(&realpath, &rdd); list_del(&rdd.middle); } 
From 121b09d30d48a59a0ae621b130f3b4e42e724e68 Mon Sep 17 00:00:00 2001 From: Antonio Murdaca Date: Thu, 7 Apr 2016 15:48:25 +0200 Subject: [PATCH 0876/1212] ovl: override creds with the ones from the superblock mounter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3fe6e52f062643676eb4518d68cee3bc1272091b upstream. In user namespace the whiteout creation fails with -EPERM because the current process isn't capable(CAP_SYS_ADMIN) when setting xattr. A simple reproducer: $ mkdir upper lower work merged lower/dir $ sudo mount -t overlay overlay -olowerdir=lower,upperdir=upper,workdir=work merged $ unshare -m -p -f -U -r bash Now as root in the user namespace: \# touch merged/dir/{1,2,3} # this will force a copy up of lower/dir \# rm -fR merged/* This ends up failing with -EPERM after the files in dir have been correctly deleted: unlinkat(4, "2", 0) = 0 unlinkat(4, "1", 0) = 0 unlinkat(4, "3", 0) = 0 close(4) = 0 unlinkat(AT_FDCWD, "merged/dir", AT_REMOVEDIR) = -1 EPERM (Operation not permitted) Interestingly, if you don't place files in merged/dir you can remove it, meaning if upper/dir does not exist, creating the char device file works properly in that same location. This patch uses ovl_override_creds() to get the cred struct from the superblock mounter and override the old cred with these new ones so that the whiteout creation is possible because overlay is wrong in assuming that the creds it will get with prepare_creds will be in the initial user namespace. The old cap_raise game is removed in favor of just overriding the old cred struct. This patch also drops from ovl_copy_up_one() the following two lines: override_cred->fsuid = stat->uid; override_cred->fsgid = stat->gid; This is because the correct uid and gid are taken directly with the stat struct and correctly set with ovl_set_attr().
Signed-off-by: Antonio Murdaca Signed-off-by: Miklos Szeredi Signed-off-by: SZ Lin (林上智) Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/copy_up.c | 26 +--------------- fs/overlayfs/dir.c | 67 +++------------------------------------- fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/readdir.c | 14 ++------- fs/overlayfs/super.c | 18 ++++++++++- 5 files changed, 27 insertions(+), 99 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 63a0d0ba36de..64c5386d0c1b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -317,7 +317,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct dentry *upperdir; struct dentry *upperdentry; const struct cred *old_cred; - struct cred *override_cred; char *link = NULL; if (WARN_ON(!workdir)) @@ -336,28 +335,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return PTR_ERR(link); } - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_free_link; - - override_cred->fsuid = stat->uid; - override_cred->fsgid = stat->gid; - /* - * CAP_SYS_ADMIN for copying up extended attributes - * CAP_DAC_OVERRIDE for create - * CAP_FOWNER for chmod, timestamp update - * CAP_FSETID for chmod - * CAP_CHOWN for chown - * CAP_MKNOD for mknod - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - cap_raise(override_cred->cap_effective, CAP_MKNOD); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(dentry->d_sb); err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { @@ -380,9 +358,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, out_unlock: unlock_rename(workdir, upperdir); revert_creds(old_cred); - put_cred(override_cred); -out_free_link: if (link) 
free_page((unsigned long) link); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 327177df03a5..f8aa54272121 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -408,28 +408,13 @@ static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, err = ovl_create_upper(dentry, inode, &stat, link, hardlink); } else { const struct cred *old_cred; - struct cred *override_cred; - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_iput; - - /* - * CAP_SYS_ADMIN for setting opaque xattr - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(dentry->d_sb); err = ovl_create_over_whiteout(dentry, inode, &stat, link, hardlink); revert_creds(old_cred); - put_cred(override_cred); } if (!err) @@ -659,32 +644,11 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (OVL_TYPE_PURE_UPPER(type)) { err = ovl_remove_upper(dentry, is_dir); } else { - const struct cred *old_cred; - struct cred *override_cred; - - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); + const struct cred *old_cred = 
ovl_override_creds(dentry->d_sb); err = ovl_remove_and_whiteout(dentry, is_dir); revert_creds(old_cred); - put_cred(override_cred); } out_drop_write: ovl_drop_write(dentry); @@ -723,7 +687,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool new_is_dir = false; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; - struct cred *override_cred = NULL; err = -EINVAL; if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) @@ -792,26 +755,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_opaque = !OVL_TYPE_PURE_UPPER(old_type); new_opaque = !OVL_TYPE_PURE_UPPER(new_type); - if (old_opaque || new_opaque) { - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); - } + if (old_opaque || new_opaque) + old_cred = ovl_override_creds(old->d_sb); if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { opaquedir = ovl_check_empty_and_clear(new); @@ -942,10 +887,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: - if (old_opaque || new_opaque) { + if (old_opaque || new_opaque) revert_creds(old_cred); - put_cred(override_cred); - } out_drop_write: ovl_drop_write(old); out: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 28316b292b8a..6d01bd46880c 100644 --- a/fs/overlayfs/overlayfs.h +++ 
b/fs/overlayfs/overlayfs.h @@ -150,6 +150,7 @@ void ovl_drop_write(struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); bool ovl_is_whiteout(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 42f2612bfd98..7613041231fc 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -36,6 +36,7 @@ struct ovl_dir_cache { struct ovl_readdir_data { struct dir_context ctx; + struct dentry *dentry; bool is_lowest; struct rb_root root; struct list_head *list; @@ -206,17 +207,8 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) struct ovl_cache_entry *p; struct dentry *dentry; const struct cred *old_cred; - struct cred *override_cred; - override_cred = prepare_creds(); - if (!override_cred) - return -ENOMEM; - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(rdd->dentry->d_sb); err = mutex_lock_killable(&dir->d_inode->i_mutex); if (!err) { @@ -232,7 +224,6 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) mutex_unlock(&dir->d_inode->i_mutex); } revert_creds(old_cred); - put_cred(override_cred); return err; } @@ -288,6 +279,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, + .dentry = dentry, .list = list, .root = RB_ROOT, .is_lowest = false, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 0035cb80ecd1..e9a382b94a23 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -42,6 +42,8 @@ struct 
ovl_fs { long lower_namelen; /* pathnames of lower and upper dirs, for show_options */ struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; }; struct ovl_dir_cache; @@ -246,6 +248,13 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + static bool ovl_is_opaquedir(struct dentry *dentry) { int res; @@ -587,6 +596,7 @@ static void ovl_put_super(struct super_block *sb) kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); kfree(ufs->config.workdir); + put_cred(ufs->creator_cred); kfree(ufs); } @@ -1107,10 +1117,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; + ufs->creator_cred = prepare_creds(); + if (!ufs->creator_cred) + goto out_put_lower_mnt; + err = -ENOMEM; oe = ovl_alloc_entry(numlower); if (!oe) - goto out_put_lower_mnt; + goto out_put_cred; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); if (!root_dentry) @@ -1143,6 +1157,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) out_free_oe: kfree(oe); +out_put_cred: + put_cred(ufs->creator_cred); out_put_lower_mnt: for (i = 0; i < ufs->numlower; i++) mntput(ufs->lower_mnt[i]); From 89f15c6e8212ab97dbdcaf561709357cd0b398f5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 1 Sep 2016 11:11:59 +0200 Subject: [PATCH 0877/1212] ovl: proper cleanup of workdir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit eea2fb4851e9dcbab6b991aaf47e2e024f1f55a0 upstream. When mounting overlayfs it needs a clean "work" directory under the supplied workdir. Previously the mount code removed this directory if it already existed and created a new one. If the removal failed (e.g. 
directory was not empty) then it fell back to a read-only mount not using the workdir. While this has never been reported, it is possible to get a non-empty "work" dir from a previous mount of overlayfs in case of crash in the middle of an operation using the work directory. In this case the left over state should be discarded and the overlay filesystem will be consistent, guaranteed by the atomicity of operations on moving to/from the workdir to the upper layer. This patch implements cleaning out any files left in workdir. It is implemented using real recursion for simplicity, but the depth is limited to 2, because the worst case is that of a directory containing whiteouts under "work". Signed-off-by: Miklos Szeredi Cc: Signed-off-by: SZ Lin (林上智) Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/overlayfs.h | 2 ++ fs/overlayfs/readdir.c | 63 +++++++++++++++++++++++++++++++++++++++- fs/overlayfs/super.c | 2 +- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6d01bd46880c..27a42975d7cd 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -165,6 +165,8 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); int ovl_check_d_type_supported(struct path *realpath); +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 7613041231fc..da999e73c97a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -248,7 +248,7 @@ static inline int ovl_dir_read(struct path *realpath, err = rdd->err; } while (!err && rdd->count); - if (!err && rdd->first_maybe_whiteout) + if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = 
ovl_check_whiteouts(realpath->dentry, rdd); fput(realfile); @@ -610,3 +610,64 @@ int ovl_check_d_type_supported(struct path *realpath) return rdd.d_type_supported; } + +static void ovl_workdir_cleanup_recurse(struct path *path, int level) +{ + int err; + struct inode *dir = path->dentry->d_inode; + LIST_HEAD(list); + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .dentry = NULL, + .list = &list, + .root = RB_ROOT, + .is_lowest = false, + }; + + err = ovl_dir_read(path, &rdd); + if (err) + goto out; + + inode_lock_nested(dir, I_MUTEX_PARENT); + list_for_each_entry(p, &list, l_node) { + struct dentry *dentry; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + dentry = lookup_one_len(p->name, path->dentry, p->len); + if (IS_ERR(dentry)) + continue; + if (dentry->d_inode) + ovl_workdir_cleanup(dir, path->mnt, dentry, level); + dput(dentry); + } + inode_unlock(dir); +out: + ovl_cache_free(&list); +} + +void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, + struct dentry *dentry, int level) +{ + int err; + + if (!d_is_dir(dentry) || level > 1) { + ovl_cleanup(dir, dentry); + return; + } + + err = ovl_do_rmdir(dir, dentry); + if (err) { + struct path path = { .mnt = mnt, .dentry = dentry }; + + inode_unlock(dir); + ovl_workdir_cleanup_recurse(&path, level + 1); + inode_lock_nested(dir, I_MUTEX_PARENT); + ovl_cleanup(dir, dentry); + } +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e9a382b94a23..fa20c95bd456 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -784,7 +784,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt, goto out_dput; retried = true; - ovl_cleanup(dir, work); + ovl_workdir_cleanup(dir, mnt, work, 0); dput(work); goto retry; } From 7edd04ddb3f37d8bdecae07f05aae5bb48416211 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:48:57 +0300 Subject: [PATCH 0878/1212] 
sch_htb: fix crash on init failure commit 88c2ace69dbef696edba77712882af03879abc9c upstream. The commit below added a call to the ->destroy() callback for all qdiscs which failed in their ->init(), but some were not prepared for such change and can't handle partially initialized qdisc. HTB is one of them and if any error occurs before the qdisc watchdog timer and qdisc work are initialized then we can hit either a null ptr deref (timer->base) when canceling in ->destroy or lockdep error info about trying to register a non-static key and a stack dump. So to fix these two move the watchdog timer and workqueue init before anything that can err out. To reproduce userspace needs to send broken htb qdisc create request, tested with a modified tc (q_htb.c). Trace log: [ 2710.897602] BUG: unable to handle kernel NULL pointer dereference at (null) [ 2710.897977] IP: hrtimer_active+0x17/0x8a [ 2710.898174] PGD 58fab067 [ 2710.898175] P4D 58fab067 [ 2710.898353] PUD 586c0067 [ 2710.898531] PMD 0 [ 2710.898710] [ 2710.899045] Oops: 0000 [#1] SMP [ 2710.899232] Modules linked in: [ 2710.899419] CPU: 1 PID: 950 Comm: tc Not tainted 4.13.0-rc6+ #54 [ 2710.899646] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 2710.900035] task: ffff880059ed2700 task.stack: ffff88005ad4c000 [ 2710.900262] RIP: 0010:hrtimer_active+0x17/0x8a [ 2710.900467] RSP: 0018:ffff88005ad4f960 EFLAGS: 00010246 [ 2710.900684] RAX: 0000000000000000 RBX: ffff88003701e298 RCX: 0000000000000000 [ 2710.900933] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88003701e298 [ 2710.901177] RBP: ffff88005ad4f980 R08: 0000000000000001 R09: 0000000000000001 [ 2710.901419] R10: ffff88005ad4f800 R11: 0000000000000400 R12: 0000000000000000 [ 2710.901663] R13: ffff88003701e298 R14: ffffffff822a4540 R15: ffff88005ad4fac0 [ 2710.901907] FS: 00007f2f5e90f740(0000) GS:ffff88005d880000(0000) knlGS:0000000000000000 [ 2710.902277] CS: 0010 DS: 0000 ES: 0000 CR0: 
0000000080050033 [ 2710.902500] CR2: 0000000000000000 CR3: 0000000058ca3000 CR4: 00000000000406e0 [ 2710.902744] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2710.902977] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2710.903180] Call Trace: [ 2710.903332] hrtimer_try_to_cancel+0x1a/0x93 [ 2710.903504] hrtimer_cancel+0x15/0x20 [ 2710.903667] qdisc_watchdog_cancel+0x12/0x14 [ 2710.903866] htb_destroy+0x2e/0xf7 [ 2710.904097] qdisc_create+0x377/0x3fd [ 2710.904330] tc_modify_qdisc+0x4d2/0x4fd [ 2710.904511] rtnetlink_rcv_msg+0x188/0x197 [ 2710.904682] ? rcu_read_unlock+0x3e/0x5f [ 2710.904849] ? rtnl_newlink+0x729/0x729 [ 2710.905017] netlink_rcv_skb+0x6c/0xce [ 2710.905183] rtnetlink_rcv+0x23/0x2a [ 2710.905345] netlink_unicast+0x103/0x181 [ 2710.905511] netlink_sendmsg+0x326/0x337 [ 2710.905679] sock_sendmsg_nosec+0x14/0x3f [ 2710.905847] sock_sendmsg+0x29/0x2e [ 2710.906010] ___sys_sendmsg+0x209/0x28b [ 2710.906176] ? do_raw_spin_unlock+0xcd/0xf8 [ 2710.906346] ? _raw_spin_unlock+0x27/0x31 [ 2710.906514] ? __handle_mm_fault+0x651/0xdb1 [ 2710.906685] ? check_chain_key+0xb0/0xfd [ 2710.906855] __sys_sendmsg+0x45/0x63 [ 2710.907018] ? __sys_sendmsg+0x45/0x63 [ 2710.907185] SyS_sendmsg+0x19/0x1b [ 2710.907344] entry_SYSCALL_64_fastpath+0x23/0xc2 Note that probably this bug goes further back because the default qdisc handling always calls ->destroy on init failure too. Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller [AmitP: Rebased for linux-4.4.y] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_htb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 87b02ed3d5f2..daa01d5604c2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1025,6 +1025,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) int err; int i; + qdisc_watchdog_init(&q->watchdog, sch); + INIT_WORK(&q->work, htb_work_func); + if (!opt) return -EINVAL; @@ -1045,8 +1048,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < TC_HTB_NUMPRIO; i++) INIT_LIST_HEAD(q->drops + i); - qdisc_watchdog_init(&q->watchdog, sch); - INIT_WORK(&q->work, htb_work_func); __skb_queue_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) From 68858be0c1b5a2387c93f5bd4de8efddac149cbb Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:48:58 +0300 Subject: [PATCH 0879/1212] sch_multiq: fix double free on init failure commit e89d469e3be3ed3d7124a803211a463ff83d0964 upstream. The below commit added a call to ->destroy() on init failure, but multiq still frees ->queues on error in init, but ->queues is also freed by ->destroy() thus we get double free and corrupted memory. 
Very easy to reproduce (eth0 not multiqueue): $ tc qdisc add dev eth0 root multiq RTNETLINK answers: Operation not supported $ ip l add dumdum type dummy (crash) Trace log: [ 3929.467747] general protection fault: 0000 [#1] SMP [ 3929.468083] Modules linked in: [ 3929.468302] CPU: 3 PID: 967 Comm: ip Not tainted 4.13.0-rc6+ #56 [ 3929.468625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 3929.469124] task: ffff88003716a700 task.stack: ffff88005872c000 [ 3929.469449] RIP: 0010:__kmalloc_track_caller+0x117/0x1be [ 3929.469746] RSP: 0018:ffff88005872f6a0 EFLAGS: 00010246 [ 3929.470042] RAX: 00000000000002de RBX: 0000000058a59000 RCX: 00000000000002df [ 3929.470406] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff821f7020 [ 3929.470770] RBP: ffff88005872f6e8 R08: 000000000001f010 R09: 0000000000000000 [ 3929.471133] R10: ffff88005872f730 R11: 0000000000008cdd R12: ff006d75646d7564 [ 3929.471496] R13: 00000000014000c0 R14: ffff88005b403c00 R15: ffff88005b403c00 [ 3929.471869] FS: 00007f0b70480740(0000) GS:ffff88005d980000(0000) knlGS:0000000000000000 [ 3929.472286] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3929.472677] CR2: 00007ffcee4f3000 CR3: 0000000059d45000 CR4: 00000000000406e0 [ 3929.473209] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3929.474109] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3929.474873] Call Trace: [ 3929.475337] ? kstrdup_const+0x23/0x25 [ 3929.475863] kstrdup+0x2e/0x4b [ 3929.476338] kstrdup_const+0x23/0x25 [ 3929.478084] __kernfs_new_node+0x28/0xbc [ 3929.478478] kernfs_new_node+0x35/0x55 [ 3929.478929] kernfs_create_link+0x23/0x76 [ 3929.479478] sysfs_do_create_link_sd.isra.2+0x85/0xd7 [ 3929.480096] sysfs_create_link+0x33/0x35 [ 3929.480649] device_add+0x200/0x589 [ 3929.481184] netdev_register_kobject+0x7c/0x12f [ 3929.481711] register_netdevice+0x373/0x471 [ 3929.482174] rtnl_newlink+0x614/0x729 [ 3929.482610] ? 
rtnl_newlink+0x17f/0x729 [ 3929.483080] rtnetlink_rcv_msg+0x188/0x197 [ 3929.483533] ? rcu_read_unlock+0x3e/0x5f [ 3929.483984] ? rtnl_newlink+0x729/0x729 [ 3929.484420] netlink_rcv_skb+0x6c/0xce [ 3929.484858] rtnetlink_rcv+0x23/0x2a [ 3929.485291] netlink_unicast+0x103/0x181 [ 3929.485735] netlink_sendmsg+0x326/0x337 [ 3929.486181] sock_sendmsg_nosec+0x14/0x3f [ 3929.486614] sock_sendmsg+0x29/0x2e [ 3929.486973] ___sys_sendmsg+0x209/0x28b [ 3929.487340] ? do_raw_spin_unlock+0xcd/0xf8 [ 3929.487719] ? _raw_spin_unlock+0x27/0x31 [ 3929.488092] ? __handle_mm_fault+0x651/0xdb1 [ 3929.488471] ? check_chain_key+0xb0/0xfd [ 3929.488847] __sys_sendmsg+0x45/0x63 [ 3929.489206] ? __sys_sendmsg+0x45/0x63 [ 3929.489576] SyS_sendmsg+0x19/0x1b [ 3929.489901] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 3929.490172] RIP: 0033:0x7f0b6fb93690 [ 3929.490423] RSP: 002b:00007ffcee4ed588 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 3929.490881] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007f0b6fb93690 [ 3929.491198] RDX: 0000000000000000 RSI: 00007ffcee4ed5d0 RDI: 0000000000000003 [ 3929.491521] RBP: ffff88005872ff98 R08: 0000000000000001 R09: 0000000000000000 [ 3929.491801] R10: 00007ffcee4ed350 R11: 0000000000000246 R12: 0000000000000002 [ 3929.492075] R13: 000000000066f1a0 R14: 00007ffcee4f5680 R15: 0000000000000000 [ 3929.492352] ? trace_hardirqs_off_caller+0xa7/0xcf [ 3929.492590] Code: 8b 45 c0 48 8b 45 b8 74 17 48 8b 4d c8 83 ca ff 44 89 ee 4c 89 f7 e8 83 ca ff ff 49 89 c4 eb 49 49 63 56 20 48 8d 48 01 4d 8b 06 <49> 8b 1c 14 48 89 c2 4c 89 e0 65 49 0f c7 08 0f 94 c0 83 f0 01 [ 3929.493335] RIP: __kmalloc_track_caller+0x117/0x1be RSP: ffff88005872f6a0 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: f07d1501292b ("multiq: Further multiqueue cleanup") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller [AmitP: Removed unused variable 'err' in multiq_init()] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_multiq.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index bcdd54bb101c..cef36ad691dd 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -254,7 +254,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) static int multiq_init(struct Qdisc *sch, struct nlattr *opt) { struct multiq_sched_data *q = qdisc_priv(sch); - int i, err; + int i; q->queues = NULL; @@ -269,12 +269,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; - err = multiq_tune(sch, opt); - - if (err) - kfree(q->queues); - - return err; + return multiq_tune(sch, opt); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) From 9dafa62c875599b077445866d2bd903afdc7e60e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:48:59 +0300 Subject: [PATCH 0880/1212] sch_hhf: fix null pointer dereference on init failure commit 32db864d33c21fd70a217ba53cb7224889354ffb upstream. If sch_hhf fails in its ->init() function (either due to wrong user-space arguments as below or memory alloc failure of hh_flows) it will do a null pointer deref of q->hh_flows in its ->destroy() function. 
To reproduce the crash: $ tc qdisc add dev eth0 root hhf quantum 2000000 non_hh_weight 10000000 Crash log: [ 690.654882] BUG: unable to handle kernel NULL pointer dereference at (null) [ 690.655565] IP: hhf_destroy+0x48/0xbc [ 690.655944] PGD 37345067 [ 690.655948] P4D 37345067 [ 690.656252] PUD 58402067 [ 690.656554] PMD 0 [ 690.656857] [ 690.657362] Oops: 0000 [#1] SMP [ 690.657696] Modules linked in: [ 690.658032] CPU: 3 PID: 920 Comm: tc Not tainted 4.13.0-rc6+ #57 [ 690.658525] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 690.659255] task: ffff880058578000 task.stack: ffff88005acbc000 [ 690.659747] RIP: 0010:hhf_destroy+0x48/0xbc [ 690.660146] RSP: 0018:ffff88005acbf9e0 EFLAGS: 00010246 [ 690.660601] RAX: 0000000000000000 RBX: 0000000000000020 RCX: 0000000000000000 [ 690.661155] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffff821f63f0 [ 690.661710] RBP: ffff88005acbfa08 R08: ffffffff81b10a90 R09: 0000000000000000 [ 690.662267] R10: 00000000f42b7019 R11: ffff880058578000 R12: 00000000ffffffea [ 690.662820] R13: ffff8800372f6400 R14: 0000000000000000 R15: 0000000000000000 [ 690.663769] FS: 00007f8ae5e8b740(0000) GS:ffff88005d980000(0000) knlGS:0000000000000000 [ 690.667069] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 690.667965] CR2: 0000000000000000 CR3: 0000000058523000 CR4: 00000000000406e0 [ 690.668918] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 690.669945] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 690.671003] Call Trace: [ 690.671743] qdisc_create+0x377/0x3fd [ 690.672534] tc_modify_qdisc+0x4d2/0x4fd [ 690.673324] rtnetlink_rcv_msg+0x188/0x197 [ 690.674204] ? rcu_read_unlock+0x3e/0x5f [ 690.675091] ? 
rtnl_newlink+0x729/0x729 [ 690.675877] netlink_rcv_skb+0x6c/0xce [ 690.676648] rtnetlink_rcv+0x23/0x2a [ 690.677405] netlink_unicast+0x103/0x181 [ 690.678179] netlink_sendmsg+0x326/0x337 [ 690.678958] sock_sendmsg_nosec+0x14/0x3f [ 690.679743] sock_sendmsg+0x29/0x2e [ 690.680506] ___sys_sendmsg+0x209/0x28b [ 690.681283] ? __handle_mm_fault+0xc7d/0xdb1 [ 690.681915] ? check_chain_key+0xb0/0xfd [ 690.682449] __sys_sendmsg+0x45/0x63 [ 690.682954] ? __sys_sendmsg+0x45/0x63 [ 690.683471] SyS_sendmsg+0x19/0x1b [ 690.683974] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 690.684516] RIP: 0033:0x7f8ae529d690 [ 690.685016] RSP: 002b:00007fff26d2d6b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 690.685931] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007f8ae529d690 [ 690.686573] RDX: 0000000000000000 RSI: 00007fff26d2d700 RDI: 0000000000000003 [ 690.687047] RBP: ffff88005acbff98 R08: 0000000000000001 R09: 0000000000000000 [ 690.687519] R10: 00007fff26d2d480 R11: 0000000000000246 R12: 0000000000000002 [ 690.687996] R13: 0000000001258070 R14: 0000000000000001 R15: 0000000000000000 [ 690.688475] ? trace_hardirqs_off_caller+0xa7/0xcf [ 690.688887] Code: 00 00 e8 2a 02 ae ff 49 8b bc 1d 60 02 00 00 48 83 c3 08 e8 19 02 ae ff 48 83 fb 20 75 dc 45 31 f6 4d 89 f7 4d 03 bd 20 02 00 00 <49> 8b 07 49 39 c7 75 24 49 83 c6 10 49 81 fe 00 40 00 00 75 e1 [ 690.690200] RIP: hhf_destroy+0x48/0xbc RSP: ffff88005acbf9e0 [ 690.690636] CR2: 0000000000000000 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_hhf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 45d4b2f22f62..aff2a1b46f7f 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -501,6 +501,9 @@ static void hhf_destroy(struct Qdisc *sch) hhf_free(q->hhf_valid_bits[i]); } + if (!q->hh_flows) + return; + for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; From 7a4eae7ae6f47f23aba15eca83f3798a6bc1b855 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:03 +0300 Subject: [PATCH 0881/1212] sch_netem: avoid null pointer deref on init failure commit 634576a1844dba15bc5e6fc61d72f37e13a21615 upstream. netem can fail in ->init due to missing options (either not supplied by user-space or used as a default qdisc) causing a timer->base null pointer deref in its ->destroy() and ->reset() callbacks. Reproduce: $ sysctl net.core.default_qdisc=netem $ ip l set ethX up Crash log: [ 1814.846943] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1814.847181] IP: hrtimer_active+0x17/0x8a [ 1814.847270] PGD 59c34067 [ 1814.847271] P4D 59c34067 [ 1814.847337] PUD 37374067 [ 1814.847403] PMD 0 [ 1814.847468] [ 1814.847582] Oops: 0000 [#1] SMP [ 1814.847655] Modules linked in: sch_netem(O) sch_fq_codel(O) [ 1814.847761] CPU: 3 PID: 1573 Comm: ip Tainted: G O 4.13.0-rc6+ #62 [ 1814.847884] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 1814.848043] task: ffff88003723a700 task.stack: ffff88005adc8000 [ 1814.848235] RIP: 0010:hrtimer_active+0x17/0x8a [ 1814.848407] RSP: 0018:ffff88005adcb590 EFLAGS: 00010246 [ 1814.848590] RAX: 0000000000000000 RBX: ffff880058e359d8 RCX: 0000000000000000 [ 1814.848793] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880058e359d8 [ 1814.848998] RBP: ffff88005adcb5b0 R08: 00000000014080c0 R09: 
00000000ffffffff [ 1814.849204] R10: ffff88005adcb660 R11: 0000000000000020 R12: 0000000000000000 [ 1814.849410] R13: ffff880058e359d8 R14: 00000000ffffffff R15: 0000000000000001 [ 1814.849616] FS: 00007f733bbca740(0000) GS:ffff88005d980000(0000) knlGS:0000000000000000 [ 1814.849919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1814.850107] CR2: 0000000000000000 CR3: 0000000059f0d000 CR4: 00000000000406e0 [ 1814.850313] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1814.850518] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1814.850723] Call Trace: [ 1814.850875] hrtimer_try_to_cancel+0x1a/0x93 [ 1814.851047] hrtimer_cancel+0x15/0x20 [ 1814.851211] qdisc_watchdog_cancel+0x12/0x14 [ 1814.851383] netem_reset+0xe6/0xed [sch_netem] [ 1814.851561] qdisc_destroy+0x8b/0xe5 [ 1814.851723] qdisc_create_dflt+0x86/0x94 [ 1814.851890] ? dev_activate+0x129/0x129 [ 1814.852057] attach_one_default_qdisc+0x36/0x63 [ 1814.852232] netdev_for_each_tx_queue+0x3d/0x48 [ 1814.852406] dev_activate+0x4b/0x129 [ 1814.852569] __dev_open+0xe7/0x104 [ 1814.852730] __dev_change_flags+0xc6/0x15c [ 1814.852899] dev_change_flags+0x25/0x59 [ 1814.853064] do_setlink+0x30c/0xb3f [ 1814.853228] ? check_chain_key+0xb0/0xfd [ 1814.853396] ? check_chain_key+0xb0/0xfd [ 1814.853565] rtnl_newlink+0x3a4/0x729 [ 1814.853728] ? rtnl_newlink+0x117/0x729 [ 1814.853905] ? ns_capable_common+0xd/0xb1 [ 1814.854072] ? ns_capable+0x13/0x15 [ 1814.854234] rtnetlink_rcv_msg+0x188/0x197 [ 1814.854404] ? rcu_read_unlock+0x3e/0x5f [ 1814.854572] ? rtnl_newlink+0x729/0x729 [ 1814.854737] netlink_rcv_skb+0x6c/0xce [ 1814.854902] rtnetlink_rcv+0x23/0x2a [ 1814.855064] netlink_unicast+0x103/0x181 [ 1814.855230] netlink_sendmsg+0x326/0x337 [ 1814.855398] sock_sendmsg_nosec+0x14/0x3f [ 1814.855584] sock_sendmsg+0x29/0x2e [ 1814.855747] ___sys_sendmsg+0x209/0x28b [ 1814.855912] ? do_raw_spin_unlock+0xcd/0xf8 [ 1814.856082] ? _raw_spin_unlock+0x27/0x31 [ 1814.856251] ? 
__handle_mm_fault+0x651/0xdb1 [ 1814.856421] ? check_chain_key+0xb0/0xfd [ 1814.856592] __sys_sendmsg+0x45/0x63 [ 1814.856755] ? __sys_sendmsg+0x45/0x63 [ 1814.856923] SyS_sendmsg+0x19/0x1b [ 1814.857083] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 1814.857256] RIP: 0033:0x7f733b2dd690 [ 1814.857419] RSP: 002b:00007ffe1d3387d8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 1814.858238] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007f733b2dd690 [ 1814.858445] RDX: 0000000000000000 RSI: 00007ffe1d338820 RDI: 0000000000000003 [ 1814.858651] RBP: ffff88005adcbf98 R08: 0000000000000001 R09: 0000000000000003 [ 1814.858856] R10: 00007ffe1d3385a0 R11: 0000000000000246 R12: 0000000000000002 [ 1814.859060] R13: 000000000066f1a0 R14: 00007ffe1d3408d0 R15: 0000000000000000 [ 1814.859267] ? trace_hardirqs_off_caller+0xa7/0xcf [ 1814.859446] Code: 10 55 48 89 c7 48 89 e5 e8 45 a1 fb ff 31 c0 5d c3 31 c0 c3 66 66 66 66 90 55 48 89 e5 41 56 41 55 41 54 53 49 89 fd 49 8b 45 30 <4c> 8b 20 41 8b 5c 24 38 31 c9 31 d2 48 c7 c7 50 8e 1d 82 41 89 [ 1814.860022] RIP: hrtimer_active+0x17/0x8a RSP: ffff88005adcb590 [ 1814.860214] CR2: 0000000000000000 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_netem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b7c29d5b6f04..743ff23885da 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -943,11 +943,11 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt) struct netem_sched_data *q = qdisc_priv(sch); int ret; + qdisc_watchdog_init(&q->watchdog, sch); + if (!opt) return -EINVAL; - qdisc_watchdog_init(&q->watchdog, sch); - q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt); if (ret) From aa5d14953152307414b3039b02b3b5acf26d03bc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:05 +0300 Subject: [PATCH 0882/1212] sch_tbf: fix two null pointer dereferences on init failure commit c2d6511e6a4f1f3673d711569c00c3849549e9b0 upstream. sch_tbf calls qdisc_watchdog_cancel() in both its ->reset and ->destroy callbacks but it may fail before the timer is initialized due to missing options (either not supplied by user-space or set as a default qdisc), also q->qdisc is used by ->reset and ->destroy so we need it initialized. 
Reproduce: $ sysctl net.core.default_qdisc=tbf $ ip l set ethX up Crash log: [ 959.160172] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [ 959.160323] IP: qdisc_reset+0xa/0x5c [ 959.160400] PGD 59cdb067 [ 959.160401] P4D 59cdb067 [ 959.160466] PUD 59ccb067 [ 959.160532] PMD 0 [ 959.160597] [ 959.160706] Oops: 0000 [#1] SMP [ 959.160778] Modules linked in: sch_tbf sch_sfb sch_prio sch_netem [ 959.160891] CPU: 2 PID: 1562 Comm: ip Not tainted 4.13.0-rc6+ #62 [ 959.160998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 959.161157] task: ffff880059c9a700 task.stack: ffff8800376d0000 [ 959.161263] RIP: 0010:qdisc_reset+0xa/0x5c [ 959.161347] RSP: 0018:ffff8800376d3610 EFLAGS: 00010286 [ 959.161531] RAX: ffffffffa001b1dd RBX: ffff8800373a2800 RCX: 0000000000000000 [ 959.161733] RDX: ffffffff8215f160 RSI: ffffffff8215f160 RDI: 0000000000000000 [ 959.161939] RBP: ffff8800376d3618 R08: 00000000014080c0 R09: 00000000ffffffff [ 959.162141] R10: ffff8800376d3578 R11: 0000000000000020 R12: ffffffffa001d2c0 [ 959.162343] R13: ffff880037538000 R14: 00000000ffffffff R15: 0000000000000001 [ 959.162546] FS: 00007fcc5126b740(0000) GS:ffff88005d900000(0000) knlGS:0000000000000000 [ 959.162844] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 959.163030] CR2: 0000000000000018 CR3: 000000005abc4000 CR4: 00000000000406e0 [ 959.163233] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 959.163436] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 959.163638] Call Trace: [ 959.163788] tbf_reset+0x19/0x64 [sch_tbf] [ 959.163957] qdisc_destroy+0x8b/0xe5 [ 959.164119] qdisc_create_dflt+0x86/0x94 [ 959.164284] ? 
dev_activate+0x129/0x129 [ 959.164449] attach_one_default_qdisc+0x36/0x63 [ 959.164623] netdev_for_each_tx_queue+0x3d/0x48 [ 959.164795] dev_activate+0x4b/0x129 [ 959.164957] __dev_open+0xe7/0x104 [ 959.165118] __dev_change_flags+0xc6/0x15c [ 959.165287] dev_change_flags+0x25/0x59 [ 959.165451] do_setlink+0x30c/0xb3f [ 959.165613] ? check_chain_key+0xb0/0xfd [ 959.165782] rtnl_newlink+0x3a4/0x729 [ 959.165947] ? rtnl_newlink+0x117/0x729 [ 959.166121] ? ns_capable_common+0xd/0xb1 [ 959.166288] ? ns_capable+0x13/0x15 [ 959.166450] rtnetlink_rcv_msg+0x188/0x197 [ 959.166617] ? rcu_read_unlock+0x3e/0x5f [ 959.166783] ? rtnl_newlink+0x729/0x729 [ 959.166948] netlink_rcv_skb+0x6c/0xce [ 959.167113] rtnetlink_rcv+0x23/0x2a [ 959.167273] netlink_unicast+0x103/0x181 [ 959.167439] netlink_sendmsg+0x326/0x337 [ 959.167607] sock_sendmsg_nosec+0x14/0x3f [ 959.167772] sock_sendmsg+0x29/0x2e [ 959.167932] ___sys_sendmsg+0x209/0x28b [ 959.168098] ? do_raw_spin_unlock+0xcd/0xf8 [ 959.168267] ? _raw_spin_unlock+0x27/0x31 [ 959.168432] ? __handle_mm_fault+0x651/0xdb1 [ 959.168602] ? check_chain_key+0xb0/0xfd [ 959.168773] __sys_sendmsg+0x45/0x63 [ 959.168934] ? __sys_sendmsg+0x45/0x63 [ 959.169100] SyS_sendmsg+0x19/0x1b [ 959.169260] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 959.169432] RIP: 0033:0x7fcc5097e690 [ 959.169592] RSP: 002b:00007ffd0d5c7b48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 959.169887] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007fcc5097e690 [ 959.170089] RDX: 0000000000000000 RSI: 00007ffd0d5c7b90 RDI: 0000000000000003 [ 959.170292] RBP: ffff8800376d3f98 R08: 0000000000000001 R09: 0000000000000003 [ 959.170494] R10: 00007ffd0d5c7910 R11: 0000000000000246 R12: 0000000000000006 [ 959.170697] R13: 000000000066f1a0 R14: 00007ffd0d5cfc40 R15: 0000000000000000 [ 959.170900] ? 
trace_hardirqs_off_caller+0xa7/0xcf [ 959.171076] Code: 00 41 c7 84 24 14 01 00 00 00 00 00 00 41 c7 84 24 98 00 00 00 00 00 00 00 41 5c 41 5d 41 5e 5d c3 66 66 66 66 90 55 48 89 e5 53 <48> 8b 47 18 48 89 fb 48 8b 40 48 48 85 c0 74 02 ff d0 48 8b bb [ 959.171637] RIP: qdisc_reset+0xa/0x5c RSP: ffff8800376d3610 [ 959.171821] CR2: 0000000000000018 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_tbf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index c2fbde742f37..a06c9d6bfc9c 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -432,12 +432,13 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) { struct tbf_sched_data *q = qdisc_priv(sch); + qdisc_watchdog_init(&q->watchdog, sch); + q->qdisc = &noop_qdisc; + if (opt == NULL) return -EINVAL; q->t_c = ktime_get_ns(); - qdisc_watchdog_init(&q->watchdog, sch); - q->qdisc = &noop_qdisc; return tbf_change(sch, opt); } From 5597719733b819bea7cc919188c39726382b1e05 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Tue, 2 Jan 2018 12:01:41 +0200 Subject: [PATCH 0883/1212] mei: me: allow runtime pm for platform with D0i3 commit cc365dcf0e56271bedf3de95f88922abe248e951 upstream. >From the pci power documentation: "The driver itself should not call pm_runtime_allow(), though. Instead, it should let user space or some platform-specific code do that (user space can do it via sysfs as stated above)..." However, the S0ix residency cannot be reached without MEI device getting into low power state. Hence, for mei devices that support D0i3, it's better to make runtime power management mandatory and not rely on the system integration such as udev rules. 
This policy cannot be applied globally as some older platforms were found to have broken power management. Cc: v4.13+ Cc: Rafael J. Wysocki Signed-off-by: Tomas Winkler Reviewed-by: Alexander Usyskin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/pci-me.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index adab5bbb642a..d5b84d68f988 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -230,8 +230,11 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!pci_dev_run_wake(pdev)) mei_me_set_pm_domain(dev); - if (mei_pg_is_enabled(dev)) + if (mei_pg_is_enabled(dev)) { pm_runtime_put_noidle(&pdev->dev); + if (hw->d0i3_supported) + pm_runtime_allow(&pdev->dev); + } dev_dbg(&pdev->dev, "initialization successful.\n"); From 280880cebc339fea7e92fcb7c16914c593d176c9 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 6 Aug 2018 13:49:47 +0200 Subject: [PATCH 0884/1212] s390/lib: use expoline for all bcr instructions commit 5eda25b10297684c1f46a14199ec00210f3c346e upstream. The memmove, memset, memcpy, __memset16, __memset32 and __memset64 functions have an additional indirect return branch in the form of a "bzr" instruction. These need to use expolines as well.
Cc: # v4.17+ Fixes: 97489e0663 ("s390/lib: use expoline for indirect branches") Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/lib/mem.S | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index 16c5998b9792..4254c477e8e0 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -26,7 +26,7 @@ */ ENTRY(memset) ltgr %r4,%r4 - bzr %r14 + jz .Lmemset_exit ltgr %r3,%r3 jnz .Lmemset_fill aghi %r4,-1 @@ -41,12 +41,13 @@ ENTRY(memset) .Lmemset_clear_rest: larl %r3,.Lmemset_xc ex %r4,0(%r3) +.Lmemset_exit: BR_EX %r14 .Lmemset_fill: stc %r3,0(%r2) cghi %r4,1 lgr %r1,%r2 - ber %r14 + je .Lmemset_fill_exit aghi %r4,-2 srlg %r3,%r4,8 ltgr %r3,%r3 @@ -58,6 +59,7 @@ ENTRY(memset) .Lmemset_fill_rest: larl %r3,.Lmemset_mvc ex %r4,0(%r3) +.Lmemset_fill_exit: BR_EX %r14 .Lmemset_xc: xc 0(1,%r1),0(%r1) @@ -71,7 +73,7 @@ ENTRY(memset) */ ENTRY(memcpy) ltgr %r4,%r4 - bzr %r14 + jz .Lmemcpy_exit aghi %r4,-1 srlg %r5,%r4,8 ltgr %r5,%r5 @@ -80,6 +82,7 @@ ENTRY(memcpy) .Lmemcpy_rest: larl %r5,.Lmemcpy_mvc ex %r4,0(%r5) +.Lmemcpy_exit: BR_EX %r14 .Lmemcpy_loop: mvc 0(256,%r1),0(%r3) From 116a6ad749e5acb59ba478663bb0de395edc4d26 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Aug 2018 07:14:51 -0500 Subject: [PATCH 0885/1212] ASoC: wm8994: Fix missing break in switch commit ad0eaee6195db1db1749dd46b9e6f4466793d178 upstream. Add missing break statement in order to prevent the code from falling through to the default case. Addresses-Coverity-ID: 115050 ("Missing break in switch") Reported-by: Valdis Kletnieks Signed-off-by: Gustavo A. R. Silva Acked-by: Charles Keepax Signed-off-by: Mark Brown Cc: stable@vger.kernel.org [Gustavo: Backported to 3.16..4.18 - Remove code comment removal] Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8994.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index a18aecb49935..2b770d3f05d4 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -2431,6 +2431,7 @@ static int wm8994_set_dai_sysclk(struct snd_soc_dai *dai, snd_soc_update_bits(codec, WM8994_POWER_MANAGEMENT_2, WM8994_OPCLK_ENA, 0); } + break; default: return -EINVAL; From a632d2d1849f9370d87e397319304f3787b5d05b Mon Sep 17 00:00:00 2001 From: Ethan Lien Date: Mon, 2 Jul 2018 15:44:58 +0800 Subject: [PATCH 0886/1212] btrfs: use correct compare function of dirty_metadata_bytes commit d814a49198eafa6163698bdd93961302f3a877a4 upstream. We use a customized, nodesize batch value to update dirty_metadata_bytes. We should also use the batch version of the compare function, or we will easily go to the fast path and get a false result from percpu_counter_compare(). Fixes: e2d845211eda ("Btrfs: use percpu counter for dirty metadata count") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Ethan Lien Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba [ nb: Rebased on 4.4.y ] Signed-off-by: Nikolay Borisov Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d106b981d86f..ae6e3a30e61e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1011,8 +1011,9 @@ static int btree_writepages(struct address_space *mapping, fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, - BTRFS_DIRTY_METADATA_THRESH); + ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH, + fs_info->dirty_metadata_batch); if (ret < 0) return 0; } @@ -3987,8 +3988,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, if (flush_delayed)
btrfs_balance_delayed_items(root); - ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, - BTRFS_DIRTY_METADATA_THRESH); + ret = __percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH, + root->fs_info->dirty_metadata_batch); if (ret > 0) { balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); From c40a7b3592b3b7519eadc130c5583db2aaf70f68 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 15 Sep 2018 09:40:42 +0200 Subject: [PATCH 0887/1212] Linux 4.4.156 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2d9f89ec8397..6dd5924a7ea5 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 155 +SUBLEVEL = 156 EXTRAVERSION = NAME = Blurry Fish Butt From a975831e398bed18436fc81c35c5634109d02254 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Jun 2018 19:34:40 -0700 Subject: [PATCH 0888/1212] f2fs: flush journal nat entries for nat_bits during unmount Let's flush journal nat entries for speed up in the next run. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b72fac4766a9..ad8e58cbb698 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2585,6 +2585,13 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) nid_t set_idx = 0; LIST_HEAD(sets); + /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + if (enabled_nat_bits(sbi, cpc)) { + down_write(&nm_i->nat_tree_lock); + remove_nats_in_journal(sbi); + up_write(&nm_i->nat_tree_lock); + } + if (!nm_i->dirty_nat_cnt) return; From 67a51cab20f7f400c508f51727e1686301542904 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Jun 2018 18:55:12 -0700 Subject: [PATCH 0889/1212] f2fs: allow wrong configured dio to buffered write This fixes to support dio having unaligned buffers as buffered writes. xfs_io -f -d -c "pwrite 0 512" $testfile -> okay xfs_io -f -d -c "pwrite 1 512" $testfile -> EINVAL Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a166927355c8..dd1f812c12fe 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2368,14 +2368,20 @@ static int f2fs_write_end(struct file *file, static int check_direct_IO(struct inode *inode, struct iov_iter *iter, loff_t offset) { - unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - - if (offset & blocksize_mask) - return -EINVAL; - - if (iov_iter_alignment(iter) & blocksize_mask) - return -EINVAL; + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; + unsigned blocksize_mask = (1 << blkbits) - 1; + unsigned long align = offset | iov_iter_alignment(iter); + struct block_device *bdev = inode->i_sb->s_bdev; + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if 
(align & blocksize_mask) + return -EINVAL; + return 1; + } return 0; } @@ -2393,7 +2399,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, err = check_direct_IO(inode, iter, offset); if (err) - return err; + return err < 0 ? err : 0; if (f2fs_force_buffered_io(inode, rw)) return 0; From 306b69b0edebee55d45576db708bc0d0e42051f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Jul 2018 16:47:34 -0700 Subject: [PATCH 0890/1212] f2fs: do checkpoint in kill_sb When unmounting f2fs in force mode, we can get it stuck by io_schedule() by some pending IOs in meta_inode. io_schedule+0xd/0x30 wait_on_page_bit_common+0xc6/0x130 __filemap_fdatawait_range+0xbd/0x100 filemap_fdatawait_keep_errors+0x15/0x40 sync_inodes_sb+0x1cf/0x240 sync_filesystem+0x52/0x90 generic_shutdown_super+0x1d/0x110 kill_f2fs_super+0x28/0x80 [f2fs] deactivate_locked_super+0x35/0x60 cleanup_mnt+0x36/0x70 task_work_run+0x79/0xa0 exit_to_usermode_loop+0x62/0x70 do_syscall_64+0xdb/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 0xffffffffffffffff Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 08635dc2594f..735d7695f1d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3097,9 +3097,19 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { if (sb->s_root) { - set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); - f2fs_stop_gc_thread(F2FS_SB(sb)); - f2fs_stop_discard_thread(F2FS_SB(sb)); + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + f2fs_write_checkpoint(sbi, &cpc); + } } kill_block_super(sb); } From d4aa07fef609efde260b96e8d08b6ade2c1944ce Mon Sep 
17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jul 2018 18:30:42 -0700 Subject: [PATCH 0891/1212] f2fs: keep meta pages in cp_error state It turns out losing meta pages in shutdown period makes f2fs very unstable so that I could see many unexpected error conditions. Let's keep meta pages for fault injection and sudden power-off tests. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 178623c15765..a30d93867284 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -243,11 +243,8 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); - if (unlikely(f2fs_cp_error(sbi))) { - dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); - return 0; - } + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) @@ -1130,6 +1127,9 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; + if (unlikely(f2fs_cp_error(sbi))) + break; + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); @@ -1203,8 +1203,12 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, /* writeout cp pack 2 page */ err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); - f2fs_bug_on(sbi, err); + if (unlikely(err && f2fs_cp_error(sbi))) { + f2fs_put_page(page, 1); + return; + } + f2fs_bug_on(sbi, err); f2fs_put_page(page, 0); /* submit checkpoint (with barrier if NOBARRIER is not set) */ @@ -1230,7 +1234,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) while (get_pages(sbi, F2FS_DIRTY_META)) { f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + break; } /* @@ -1310,7 +1314,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, 
struct cp_control *cpc) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + break; } } @@ -1351,9 +1355,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - /* flush all device cache */ err = f2fs_flush_device_cache(sbi); if (err) @@ -1365,9 +1366,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_release_ino_entry(sbi, false); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1382,7 +1380,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); - return 0; + return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } /* From bbe8bea7fd28f441c1dde3eb0f855f232535908c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 21 Jun 2018 13:46:23 -0700 Subject: [PATCH 0892/1212] f2fs: indicate shutdown f2fs to allow unmount successfully Once we shutdown f2fs, we have to flush stale pages in order to unmount the system. In order to make stable, we need to stop fault injection as well. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/f2fs.h | 7 +++++++ fs/f2fs/file.c | 4 ++++ fs/f2fs/inode.c | 3 +++ fs/f2fs/node.c | 3 ++- fs/f2fs/super.c | 5 +---- 6 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a30d93867284..fd9d139bd7d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -28,6 +28,7 @@ struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { + f2fs_build_fault_attr(sbi, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); if (!end_io) f2fs_flush_merged_writes(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0b514cf1ac6f..7dd2cf91b029 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1131,6 +1131,7 @@ enum { SBI_POR_DOING, /* recovery is doing or not */ SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ + SBI_IS_SHUTDOWN, /* shutdown by ioctl */ }; enum { @@ -3454,4 +3455,10 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) F2FS_I_SB(inode)->s_ndevs); } +#ifdef CONFIG_F2FS_FAULT_INJECTION +extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate); +#else +#define f2fs_build_fault_attr(sbi, rate) do { } while (0) +#endif + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1ada29893092..84e47a39592e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1889,6 +1889,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) } if (sb) { f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev, sb); } break; @@ -1898,13 +1899,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) goto out; f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: 
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; default: ret = -EINVAL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 27e18b5cb459..c80d42ef0860 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -159,6 +159,9 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_inode *ri; __u32 provided, calculated; + if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) + return true; + if (!f2fs_enable_inode_chksum(sbi, page) || PageDirty(page) || PageWriteback(page)) return true; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ad8e58cbb698..760c2fdfd1c1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1146,7 +1146,8 @@ static int read_node_page(struct page *page, int op_flags) f2fs_get_node_info(sbi, page->index, &ni); - if (unlikely(ni.blk_addr == NULL_ADDR)) { + if (unlikely(ni.blk_addr == NULL_ADDR) || + is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 735d7695f1d4..3416c840652e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -57,8 +57,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_CHECKPOINT] = "checkpoint error", }; -static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, - unsigned int rate) +void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; @@ -1380,9 +1379,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, POSIX_ACL); #endif -#ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, 0); -#endif } #ifdef CONFIG_QUOTA From 0c12cf984c2b3301c82f5ccb70d03824914564f2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 15 Jul 2018 09:58:08 +0900 Subject: [PATCH 0893/1212] f2fs: avoid potential deadlock in f2fs_sbi_store [ 155.018460] ====================================================== [ 155.021431] WARNING: possible circular locking 
dependency detected [ 155.024339] 4.18.0-rc3+ #5 Tainted: G OE [ 155.026879] ------------------------------------------------------ [ 155.029783] umount/2901 is trying to acquire lock: [ 155.032187] 00000000c4282f1f (kn->count#130){++++}, at: kernfs_remove+0x1f/0x30 [ 155.035439] [ 155.035439] but task is already holding lock: [ 155.038892] 0000000056e4307b (&type->s_umount_key#41){++++}, at: deactivate_super+0x33/0x50 [ 155.042602] [ 155.042602] which lock already depends on the new lock. [ 155.042602] [ 155.047465] [ 155.047465] the existing dependency chain (in reverse order) is: [ 155.051354] [ 155.051354] -> #1 (&type->s_umount_key#41){++++}: [ 155.054768] f2fs_sbi_store+0x61/0x460 [f2fs] [ 155.057083] kernfs_fop_write+0x113/0x1a0 [ 155.059277] __vfs_write+0x36/0x180 [ 155.061250] vfs_write+0xbe/0x1b0 [ 155.063179] ksys_write+0x55/0xc0 [ 155.065068] do_syscall_64+0x60/0x1b0 [ 155.067071] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 155.069529] [ 155.069529] -> #0 (kn->count#130){++++}: [ 155.072421] __kernfs_remove+0x26f/0x2e0 [ 155.074452] kernfs_remove+0x1f/0x30 [ 155.076342] kobject_del.part.5+0xe/0x40 [ 155.078354] f2fs_put_super+0x12d/0x290 [f2fs] [ 155.080500] generic_shutdown_super+0x6c/0x110 [ 155.082655] kill_block_super+0x21/0x50 [ 155.084634] kill_f2fs_super+0x9c/0xc0 [f2fs] [ 155.086726] deactivate_locked_super+0x3f/0x70 [ 155.088826] cleanup_mnt+0x3b/0x70 [ 155.090584] task_work_run+0x93/0xc0 [ 155.092367] exit_to_usermode_loop+0xf0/0x100 [ 155.094466] do_syscall_64+0x162/0x1b0 [ 155.096312] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 155.098603] [ 155.098603] other info that might help us debug this: [ 155.098603] [ 155.102418] Possible unsafe locking scenario: [ 155.102418] [ 155.105134] CPU0 CPU1 [ 155.107037] ---- ---- [ 155.108910] lock(&type->s_umount_key#41); [ 155.110674] lock(kn->count#130); [ 155.113010] lock(&type->s_umount_key#41); [ 155.115608] lock(kn->count#130); Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 6 
++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 60c827eadd82..27ddf60e3362 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -286,8 +286,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || a->struct_type == GC_THREAD); - if (gc_entry) - down_read(&sbi->sb->s_umount); + if (gc_entry) { + if (!down_read_trylock(&sbi->sb->s_umount)) + return -EAGAIN; + } ret = __sbi_store(a, sbi, buf, count); if (gc_entry) up_read(&sbi->sb->s_umount); From 1e77f24e1b45f615cb916c8f896a39d30601e0d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Jun 2018 23:55:01 +0800 Subject: [PATCH 0894/1212] f2fs: don't acquire orphan ino during recovery During orphan inode recovery, checkpoint should never succeed due to the SBI_POR_DOING flag, so we don't need to acquire an orphan ino, which is only used by checkpoint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fd9d139bd7d3..3019b2a52c07 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -571,12 +571,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct node_info ni; - int err = f2fs_acquire_orphan_inode(sbi); - - if (err) - goto err_out; - - __add_ino_entry(sbi, ino, 0, ORPHAN_INO); + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { @@ -606,7 +601,6 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) err = -EIO; goto err_out; } - __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; err_out: From f94a264ef4a1166e16035cf2c733a728ec733b28 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Jun 2018 23:55:02 +0800 Subject: [PATCH 0895/1212] f2fs: move s_res{u,g}id initialization to default_options() Let default_options() initialize s_res{u,g}id with default value like other options. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3416c840652e..87b3259709a2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1355,6 +1355,8 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).test_dummy_encryption = false; + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -2705,9 +2707,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; - F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); - F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, From 09271307a2e7bf77694908a496954518ea821c61 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Jun 2018 18:02:01 +0800 Subject: [PATCH 0896/1212] f2fs: relocate readdir_ra configure initialization readdir_ra is sysfs configuration instead of mount option, so it should not be initialized in default_options(), otherwise after remount, it can be reset to be enabled which may not as user wish, so let's move it to f2fs_tuning_parameters(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 87b3259709a2..e7045117042b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1357,7 +1357,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - sbi->readdir_ra = 1; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -2658,6 +2657,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } + + sbi->readdir_ra = 1; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) From 8e900c803dcd868f3e7278bf5ddb2a2a2b14ec62 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Jun 2018 18:02:02 +0800 Subject: [PATCH 0897/1212] f2fs: fix error path of fill_super In fill_super, if root inode's attribute is incorrect, we need to call f2fs_destroy_stats to release stats memory. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e7045117042b..f91c0d84bb17 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2919,7 +2919,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); err = -EINVAL; - goto free_node_inode; + goto free_stats; } sb->s_root = d_make_root(root); /* allocate root dentry */ From 25e0980d84de4fefb51c9e1bf1976baa3e5e0eac Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Fri, 9 Mar 2018 23:10:21 +0800 Subject: [PATCH 0898/1212] f2fs: support in-memory inode checksum when checking consistency Enable in-memory inode checksum to protect metadata blocks from in-memory scribbles when checking consistency, which has no performance requirements. Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 8 ++++++++ fs/f2fs/node.c | 10 +++++++++- fs/f2fs/node.h | 4 ++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c80d42ef0860..8f6d340fa514 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -162,8 +162,12 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) return true; +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_enable_inode_chksum(sbi, page)) +#else if (!f2fs_enable_inode_chksum(sbi, page) || PageDirty(page) || PageWriteback(page)) +#endif return true; ri = &F2FS_NODE(page)->i; @@ -477,6 +481,10 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); +#endif } void 
f2fs_update_inode_page(struct inode *inode) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 760c2fdfd1c1..b3ca611b64d0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1141,8 +1141,12 @@ static int read_node_page(struct page *page, int op_flags) .encrypted_page = NULL, }; - if (PageUptodate(page)) + if (PageUptodate(page)) { +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_bug_on(sbi, !f2fs_inode_chksum_verify(sbi, page)); +#endif return LOCKED_PAGE; + } f2fs_get_node_info(sbi, page->index, &ni); @@ -1778,6 +1782,10 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); +#ifdef CONFIG_F2FS_CHECK_FS + if (IS_INODE(page)) + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index b95e49e4a928..8f34bdffde93 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -444,6 +444,10 @@ static inline void set_mark(struct page *page, int mark, int type) else flag &= ~(0x1 << type); rn->footer.flag = cpu_to_le32(flag); + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif } #define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) #define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) From 6e728636192fc802025447accf3b4c494e328bc8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Jun 2018 14:45:57 +0800 Subject: [PATCH 0899/1212] f2fs: fix to propagate return value of scan_nat_page() As Anatoly Trosinenko reported in bugzilla: How to reproduce: 1. Compile the 73fcb1a370c76 version of the kernel using the config attached 2. Unpack and mount the attached filesystem image as F2FS 3. The kernel will BUG() on mount (BUGs are explicitly enabled in config) [ 2.233612] F2FS-fs (sda): Found nat_bits in checkpoint [ 2.248422] ------------[ cut here ]------------ [ 2.248857] kernel BUG at fs/f2fs/node.c:1967! 
[ 2.249760] invalid opcode: 0000 [#1] SMP NOPTI [ 2.250219] Modules linked in: [ 2.251848] CPU: 0 PID: 944 Comm: mount Not tainted 4.17.0-rc5+ #1 [ 2.252331] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 2.253305] RIP: 0010:build_free_nids+0x337/0x3f0 [ 2.253672] RSP: 0018:ffffae7fc0857c50 EFLAGS: 00000246 [ 2.254080] RAX: 00000000ffffffff RBX: 0000000000000123 RCX: 0000000000000001 [ 2.254638] RDX: ffff9aa7063d5c00 RSI: 0000000000000122 RDI: ffff9aa705852e00 [ 2.255190] RBP: ffff9aa705852e00 R08: 0000000000000001 R09: ffff9aa7059090c0 [ 2.255719] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9aa705852e00 [ 2.256242] R13: ffff9aa7063ad000 R14: ffff9aa705919000 R15: 0000000000000123 [ 2.256809] FS: 00000000023078c0(0000) GS:ffff9aa707800000(0000) knlGS:0000000000000000 [ 2.258654] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2.259153] CR2: 00000000005511ae CR3: 0000000005872000 CR4: 00000000000006f0 [ 2.259801] Call Trace: [ 2.260583] build_node_manager+0x5cd/0x600 [ 2.260963] f2fs_fill_super+0x66a/0x17c0 [ 2.261300] ? f2fs_commit_super+0xe0/0xe0 [ 2.261622] mount_bdev+0x16e/0x1a0 [ 2.261899] mount_fs+0x30/0x150 [ 2.262398] vfs_kern_mount.part.28+0x4f/0xf0 [ 2.262743] do_mount+0x5d0/0xc60 [ 2.263010] ? _copy_from_user+0x37/0x60 [ 2.263313] ? 
memdup_user+0x39/0x60 [ 2.263692] ksys_mount+0x7b/0xd0 [ 2.263960] __x64_sys_mount+0x1c/0x20 [ 2.264268] do_syscall_64+0x43/0xf0 [ 2.264560] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2.265095] RIP: 0033:0x48d31a [ 2.265502] RSP: 002b:00007ffc6fe60a08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [ 2.266089] RAX: ffffffffffffffda RBX: 0000000000008000 RCX: 000000000048d31a [ 2.266607] RDX: 00007ffc6fe62fa5 RSI: 00007ffc6fe62f9d RDI: 00007ffc6fe62f94 [ 2.267130] RBP: 00000000023078a0 R08: 0000000000000000 R09: 0000000000000000 [ 2.267670] R10: 0000000000008000 R11: 0000000000000246 R12: 0000000000000000 [ 2.268192] R13: 0000000000000000 R14: 00007ffc6fe60c78 R15: 0000000000000000 [ 2.268767] Code: e8 5f c3 ff ff 83 c3 01 41 83 c7 01 81 fb c7 01 00 00 74 48 44 39 7d 04 76 42 48 63 c3 48 8d 04 c0 41 8b 44 06 05 83 f8 ff 75 c1 <0f> 0b 49 8b 45 50 48 8d b8 b0 00 00 00 e8 37 59 69 00 b9 01 00 [ 2.270434] RIP: build_free_nids+0x337/0x3f0 RSP: ffffae7fc0857c50 [ 2.271426] ---[ end trace ab20c06cd3c8fde4 ]--- While loading NAT entries we do a sanity check; once the entry info is found to be corrupted, it triggers BUG_ON directly to protect user data from being overwritten. In this case, it is better to just return failure from mount() instead of panicking, so that the user can get a hint from kmsg and run fsck for recovery immediately rather than after an abnormal reboot. 
https://bugzilla.kernel.org/show_bug.cgi?id=199769 Reported-by: Anatoly Trosinenko Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 42 ++++++++++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7dd2cf91b029..f1805106527d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2894,7 +2894,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); -void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b3ca611b64d0..4a89616e138b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1980,7 +1980,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void scan_nat_page(struct f2fs_sb_info *sbi, +static int scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1998,7 +1998,10 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - f2fs_bug_on(sbi, blk_addr == NEW_ADDR); + + if (blk_addr == NEW_ADDR) + return -EINVAL; + if (blk_addr == NULL_ADDR) { add_free_nid(sbi, start_nid, true, true); } else { @@ -2007,6 +2010,8 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, spin_unlock(&NM_I(sbi)->nid_list_lock); } } + + return 0; } static void scan_curseg_cache(struct f2fs_sb_info *sbi) @@ -2062,11 +2067,11 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) 
up_read(&nm_i->nat_tree_lock); } -static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, +static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int i = 0; + int i = 0, ret; nid_t nid = nm_i->next_scan_nid; if (unlikely(nid >= nm_i->max_nid)) @@ -2074,17 +2079,17 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, /* Enough entries */ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) - return; + return 0; if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) - return; + return 0; if (!mount) { /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) - return; + return 0; } /* readahead nat pages to be scanned */ @@ -2098,8 +2103,16 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, nm_i->nat_block_bitmap)) { struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); + ret = scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); + + if (ret) { + up_read(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, !mount); + f2fs_msg(sbi->sb, KERN_ERR, + "NAT is corrupt, run fsck to fix it"); + return -EINVAL; + } } nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); @@ -2120,13 +2133,19 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); + + return 0; } -void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { + int ret; + mutex_lock(&NM_I(sbi)->build_lock); - __f2fs_build_free_nids(sbi, sync, mount); + ret = __f2fs_build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); + + return ret; } /* @@ -2820,8 +2839,7 @@ int f2fs_build_node_manager(struct f2fs_sb_info *sbi) /* load free nid status from nat_bits table */ load_free_nid_bitmap(sbi); - 
f2fs_build_free_nids(sbi, true, true); - return 0; + return f2fs_build_free_nids(sbi, true, true); } void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) From 01fc47dc05a3ff726af67758e2436f69ec48bc6b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 20 Jun 2018 21:27:21 -0700 Subject: [PATCH 0900/1212] f2fs: don't issue discard commands in online discard is on Actually, we don't need to issue discard commands, if discard is on, as mentioned in the comment. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3d0c42ef0474..fb1580f322c7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2549,23 +2549,24 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (err) goto out; - start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, end_segno + 1); - - __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); - __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - /* * We filed discard candidates, but actually we don't need to wait for * all of them, since they'll be issued in idle time along with runtime * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. 
*/ - if (!test_opt(sbi, DISCARD)) { - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + if (test_opt(sbi, DISCARD)) + goto out; + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - range->len = F2FS_BLK_TO_BYTES(trimmed); - } + range->len = F2FS_BLK_TO_BYTES(trimmed); out: return err; } From bb36dbbd962fa1f30b15f44b93324517beae136f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jun 2018 13:39:53 +0300 Subject: [PATCH 0901/1212] f2fs: Fix uninitialized return in f2fs_ioc_shutdown() "ret" can be uninitialized on the success path when "in == F2FS_GOING_DOWN_FULLSYNC". Fixes: 60b2b4ee2bc0 ("f2fs: Fix deadlock in shutdown ioctl") Signed-off-by: Dan Carpenter Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84e47a39592e..c232acdee44d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1866,7 +1866,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; - int ret; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; From b90a91b0cbf6c4ffc3371680e00203ea631aa2e9 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 21 Jun 2018 14:49:06 +0800 Subject: [PATCH 0902/1212] f2fs: fix a hungtask problem caused by congestion_wait This patch fix hungtask problem which can be reproduced as follow: Thread 0~3: while true do touch /xxx/test/file_xxx done Thread 4 write a new checkpoint every three seconds. In the meantime, fio start 16 threads for randwrite. 
With my debug info, the loop count in f2fs_sync_dirty_inodes exceeds 1000, and most iterations fall into congestion_wait() and sleep for more than 20ms. With this patch, the loop count is reduced to 3. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3019b2a52c07..31e305a1355f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -973,12 +973,10 @@ int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) iput(inode); /* We need to give cpu to another writers. */ - if (ino == cur_ino) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + if (ino == cur_ino) cond_resched(); - } else { + else ino = cur_ino; - } } else { /* * We should submit bio, since it exists several From 8371163afd3cfeb90e4e74176f066ac61d58c235 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 21 Jun 2018 11:29:43 -0700 Subject: [PATCH 0903/1212] f2fs: assign REQ_RAHEAD to bio for ->readpages As Jens reported, we'd better assign REQ_RAHEAD to bio, given that ->readpages is called only from read-ahead. In Documentation/filesystems/vfs.txt, readpages: called by the VM to read pages associated with the address_space object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. 
Signed-off-by: Jens Axboe Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dd1f812c12fe..dec1d9275f7a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -532,7 +532,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) + unsigned nr_pages, unsigned op_flag) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -544,7 +544,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return ERR_PTR(-ENOMEM); f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; @@ -569,7 +569,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr) { - struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1419,10 +1419,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. + * + * Note that the aops->readpages() function is ONLY used for read-ahead. If + * this function ever deviates from doing just read-ahead, it should either + * use ->readpage() or do the necessary surgery to decouple ->readpages() + * from read-ahead. 
*/ static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, - unsigned nr_pages) + unsigned nr_pages, bool is_readahead) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -1511,7 +1516,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, + is_readahead ? REQ_RAHEAD : 0); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; @@ -1555,7 +1561,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1); + ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1, false); return ret; } @@ -1572,7 +1578,7 @@ static int f2fs_read_data_pages(struct file *file, if (f2fs_has_inline_data(inode)) return 0; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); + return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); } static int encrypt_one_page(struct f2fs_io_info *fio) From b4957b928c845d339bbaeff0e1ed2bd910b05a5a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Jun 2018 22:38:28 +0800 Subject: [PATCH 0904/1212] f2fs: fix to wait on page writeback before updating page In error path of f2fs_move_rehashed_dirents, inode page could be writeback state, so we should wait on inode page writeback before updating it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index f3185ae98860..f586fb9b20fa 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -477,6 +477,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); From e7406233c15f23d796d2e100872507d4ddc61e7e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jun 2018 10:02:19 +0200 Subject: [PATCH 0905/1212] f2fs: use timespec64 for inode timestamps The on-disk representation and the vfs both use 64-bit tv_sec values, so let's change the last missing piece in the middle. Signed-off-by: Arnd Bergmann Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/namei.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f1805106527d..30964ff984d9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -734,8 +734,8 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ - struct timespec i_crtime; /* inode creation time */ - struct timespec i_disk_time[4]; /* inode disk times */ + struct timespec64 i_crtime; /* inode creation time */ + struct timespec64 i_disk_time[4];/* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2599,13 +2599,13 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) return false; - if 
(!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, &F2FS_I(inode)->i_crtime)) return false; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7b025524ee16..198d5583da06 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,8 +50,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = - F2FS_I(inode)->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->i_crtime = inode->i_mtime; inode->i_generation = sbi->s_next_generation++; if (S_ISDIR(inode->i_mode)) From c69d5a7c6fb26055edcaba8070d0de01e94a7836 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Jun 2018 17:44:11 +0800 Subject: [PATCH 0906/1212] f2fs: introduce and spread verify_blkaddr This patch introduces verify_blkaddr to check that a meta/data block address lies within its valid range, in order to detect bugs earlier. In addition, once we encounter an invalid blkaddr, notify the user to run fsck to fix it, and let the kernel panic. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++++++-- fs/f2fs/data.c | 8 ++++---- fs/f2fs/f2fs.h | 33 +++++++++++++++++++++++++++++---- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/node.c | 4 ++-- fs/f2fs/recovery.c | 6 +++--- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 8 +++----- 9 files changed, 60 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 31e305a1355f..fef18c291511 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -120,7 +120,7 @@ struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, false); } -bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { @@ -141,10 +141,16 @@ bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, return false; break; case META_POR: + case DATA_GENERIC: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) return false; break; + case META_GENERIC: + if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || + blkaddr >= MAIN_BLKADDR(sbi))) + return false; + break; default: BUG(); } @@ -178,7 +184,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if (!f2fs_is_valid_meta_blkaddr(sbi, blkno, type)) + if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) goto out; switch (type) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dec1d9275f7a..067c09cc18b8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -483,7 +483,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) spin_unlock(&io->io_lock); } - if (is_valid_blkaddr(fio->old_blkaddr)) + if (__is_valid_data_blkaddr(fio->old_blkaddr)) verify_block_addr(fio, fio->old_blkaddr); verify_block_addr(fio, fio->new_blkaddr); @@ -1043,7 +1043,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, 
next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (!is_valid_blkaddr(blkaddr)) { + if (!is_valid_data_blkaddr(sbi, blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1697,7 +1697,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (is_valid_blkaddr(fio->old_blkaddr)) { + if (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr)) { ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; @@ -1724,7 +1724,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (ipu_force || (is_valid_blkaddr(fio->old_blkaddr) && + if (ipu_force || (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) && need_inplace_update(fio))) { err = encrypt_one_page(fio); if (err) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 30964ff984d9..034c53004d1c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -262,7 +262,7 @@ struct cp_control { }; /* - * For CP/NAT/SIT/SSA readahead + * indicate meta/data type */ enum { META_CP, @@ -270,6 +270,8 @@ enum { META_SIT, META_SSA, META_POR, + DATA_GENERIC, + META_GENERIC, }; /* for the list of ino */ @@ -2751,13 +2753,36 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } -static inline bool is_valid_blkaddr(block_t blkaddr) +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); +static inline void verify_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { + f2fs_msg(sbi->sb, KERN_ERR, + "invalid blkaddr: %u, type: %d, run fsck to fix.", + blkaddr, type); + f2fs_bug_on(sbi, 1); + } +} + +static inline bool __is_valid_data_blkaddr(block_t blkaddr) { if (blkaddr == 
NEW_ADDR || blkaddr == NULL_ADDR) return false; return true; } +static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + if (!__is_valid_data_blkaddr(blkaddr)) + return false; + verify_blkaddr(sbi, blkaddr, DATA_GENERIC); + return true; +} + /* * file.c */ @@ -2981,8 +3006,8 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, - block_t blkaddr, int type); +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c232acdee44d..9301697c3e7b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -353,13 +353,13 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, return pgofs; } -static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, - int whence) +static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, + pgoff_t dirty, pgoff_t pgofs, int whence) { switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || - is_valid_blkaddr(blkaddr)) + is_valid_data_blkaddr(sbi, blkaddr)) return true; break; case SEEK_HOLE: @@ -423,7 +423,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (__found_offset(blkaddr, dirty, pgofs, whence)) { + if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, + pgofs, whence)) { f2fs_put_dnode(&dn); goto found; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8f6d340fa514..20efd4544a52 100644 --- 
a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -68,11 +68,12 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } -static bool __written_first_block(struct f2fs_inode *ri) +static bool __written_first_block(struct f2fs_sb_info *sbi, + struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (is_valid_blkaddr(addr)) + if (is_valid_data_blkaddr(sbi, addr)) return true; return false; } @@ -282,7 +283,7 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); - if (__written_first_block(ri)) + if (__written_first_block(sbi, ri)) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4a89616e138b..aa403fc1851e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -371,7 +371,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, new_blkaddr == NULL_ADDR); f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(sbi, is_valid_blkaddr(nat_get_blkaddr(e)) && + f2fs_bug_on(sbi, is_valid_data_blkaddr(sbi, nat_get_blkaddr(e)) && new_blkaddr == NEW_ADDR); /* increment version no as node is removed */ @@ -382,7 +382,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* change address */ nat_set_blkaddr(e, new_blkaddr); - if (!is_valid_blkaddr(new_blkaddr)) + if (!is_valid_data_blkaddr(sbi, new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index daf81d416b89..184b34be635b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -252,7 +252,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; page = 
f2fs_get_tmp_page(sbi, blkaddr); @@ -507,7 +507,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* dest is valid block, try to recover from src to dest */ - if (f2fs_is_valid_meta_blkaddr(sbi, dest, META_POR)) { + if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = f2fs_reserve_new_block(&dn); @@ -568,7 +568,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; f2fs_ra_meta_pages_cond(sbi, blkaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fb1580f322c7..fb99dd6ebf42 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1999,7 +1999,7 @@ bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) struct seg_entry *se; bool is_cp = false; - if (!is_valid_blkaddr(blkaddr)) + if (!is_valid_data_blkaddr(sbi, blkaddr)) return true; down_read(&sit_i->sentry_lock); @@ -3074,7 +3074,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; - if (!is_valid_blkaddr(blkaddr)) + if (!is_valid_data_blkaddr(sbi, blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f18fc82fbe99..a7460da9af43 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -85,7 +85,7 @@ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - ((!is_valid_blkaddr(blk_addr)) ? \ + ((!is_valid_data_blkaddr(sbi, blk_addr)) ? 
\ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ @@ -647,11 +647,9 @@ static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) if (PAGE_TYPE_OF_BIO(fio->type) == META && (!is_read_io(fio->op) || fio->is_meta)) - BUG_ON(blk_addr < SEG0_BLKADDR(sbi) || - blk_addr >= MAIN_BLKADDR(sbi)); + verify_blkaddr(sbi, blk_addr, META_GENERIC); else - BUG_ON(blk_addr < MAIN_BLKADDR(sbi) || - blk_addr >= MAX_BLKADDR(sbi)); + verify_blkaddr(sbi, blk_addr, DATA_GENERIC); } /* From 5e5f3dc3ce50f36775d2edc395eefd3cbcce88d1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jun 2018 16:06:59 +0800 Subject: [PATCH 0907/1212] f2fs: disable f2fs_check_rb_tree_consistence If there are millions of discard entries cached in the rb tree, each sanity check of it can cause very long latency, because the held cmd_lock blocks other lock grabbers. On the other hand, the check has been enabled for a very long time and, as far as we have seen, no such inconsistent condition has been caused by bugs. Still, we do not choose to kill the check directly; instead, add a flag to disable it for now, so that if there is a related code change, we can reuse it to detect bugs. 
Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 034c53004d1c..196967c1bc69 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -379,6 +379,7 @@ struct discard_cmd_control { atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ + bool rbtree_check; /* config for consistence check */ }; /* for the list of fsync inodes, used only during recovery */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fb99dd6ebf42..6b1b8ec9ba58 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1279,8 +1279,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; - f2fs_bug_on(sbi, - !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1832,6 +1833,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; dcc->root = RB_ROOT; + dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; @@ -2461,7 +2463,9 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, issued = 0; mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, From b8c037ce5b2f8ac7be045d489bad3472b9088632 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Jun 2018 
00:12:36 +0800 Subject: [PATCH 0908/1212] f2fs: fix to do sanity check with secs_per_zone As Wen Xu reported in below link: https://bugzilla.kernel.org/show_bug.cgi?id=200183 - Overview Divide zero in reset_curseg() when mounting a crafted f2fs image - Reproduce - Kernel message [ 588.281510] divide error: 0000 [#1] SMP KASAN PTI [ 588.282701] CPU: 0 PID: 1293 Comm: mount Not tainted 4.18.0-rc1+ #4 [ 588.284000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 588.286178] RIP: 0010:reset_curseg+0x94/0x1a0 [ 588.298166] RSP: 0018:ffff8801e88d7940 EFLAGS: 00010246 [ 588.299360] RAX: 0000000000000014 RBX: ffff8801e1d46d00 RCX: ffffffffb88bf60b [ 588.300809] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e1d46d64 [ 588.305272] R13: 0000000000000000 R14: 0000000000000014 R15: 0000000000000000 [ 588.306822] FS: 00007fad85008840(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 588.308456] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 588.309623] CR2: 0000000001705078 CR3: 00000001f30f8000 CR4: 00000000000006f0 [ 588.311085] Call Trace: [ 588.311637] f2fs_build_segment_manager+0x103f/0x3410 [ 588.316136] ? f2fs_commit_super+0x1b0/0x1b0 [ 588.317031] ? set_blocksize+0x90/0x140 [ 588.319473] f2fs_mount+0x15/0x20 [ 588.320166] mount_fs+0x60/0x1a0 [ 588.320847] ? alloc_vfsmnt+0x309/0x360 [ 588.321647] vfs_kern_mount+0x6b/0x1a0 [ 588.322432] do_mount+0x34a/0x18c0 [ 588.323175] ? strndup_user+0x46/0x70 [ 588.323937] ? copy_mount_string+0x20/0x20 [ 588.324793] ? memcg_kmem_put_cache+0x1b/0xa0 [ 588.325702] ? kasan_check_write+0x14/0x20 [ 588.326562] ? _copy_from_user+0x6a/0x90 [ 588.327375] ? 
memdup_user+0x42/0x60 [ 588.328118] ksys_mount+0x83/0xd0 [ 588.328808] __x64_sys_mount+0x67/0x80 [ 588.329607] do_syscall_64+0x78/0x170 [ 588.330400] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 588.331461] RIP: 0033:0x7fad848e8b9a [ 588.336022] RSP: 002b:00007ffd7c5b6be8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 588.337547] RAX: ffffffffffffffda RBX: 00000000016f8030 RCX: 00007fad848e8b9a [ 588.338999] RDX: 00000000016f8210 RSI: 00000000016f9f30 RDI: 0000000001700ec0 [ 588.340442] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 588.341887] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001700ec0 [ 588.343341] R13: 00000000016f8210 R14: 0000000000000000 R15: 0000000000000003 [ 588.354891] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 588.355862] RIP: 0010:reset_curseg+0x94/0x1a0 [ 588.360742] RSP: 0018:ffff8801e88d7940 EFLAGS: 00010246 [ 588.361812] RAX: 0000000000000014 RBX: ffff8801e1d46d00 RCX: ffffffffb88bf60b [ 588.363485] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e1d46d64 [ 588.365213] RBP: ffff8801e88d7968 R08: ffffed003c32266f R09: ffffed003c32266f [ 588.366661] R10: 0000000000000001 R11: ffffed003c32266e R12: ffff8801f0337700 [ 588.368110] R13: 0000000000000000 R14: 0000000000000014 R15: 0000000000000000 [ 588.370057] FS: 00007fad85008840(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 588.372099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 588.373291] CR2: 0000000001705078 CR3: 00000001f30f8000 CR4: 00000000000006f0 - Location https://elixir.bootlin.com/linux/latest/source/fs/f2fs/segment.c#L2147 curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); If secs_per_zone is corrupted due to fuzzing test, it will cause divide zero operation when using GET_ZONE_FROM_SEG macro, so we should do more sanity check with secs_per_zone during mount to avoid this issue. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f91c0d84bb17..df0a19397c63 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2231,9 +2231,9 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (secs_per_zone > total_sections) { + if (secs_per_zone > total_sections || !secs_per_zone) { f2fs_msg(sb, KERN_INFO, - "Wrong secs_per_zone (%u > %u)", + "Wrong secs_per_zone / total_sections (%u, %u)", secs_per_zone, total_sections); return 1; } From 748d56c1f4fad878234177be53090b7a22b51df6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Jun 2018 11:25:19 +0800 Subject: [PATCH 0909/1212] f2fs: fix to do sanity check with {sit,nat}_ver_bitmap_bytesize This patch adds to do sanity check with {sit,nat}_ver_bitmap_bytesize during mount, in order to avoid accessing across cache boundary with this abnormal bitmap size. - Overview buffer overrun in build_sit_info() when mounting a crafted f2fs image - Reproduce - Kernel message [ 548.580867] F2FS-fs (loop0): Invalid log blocks per segment (8201) [ 548.580877] F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock [ 548.584979] ================================================================== [ 548.586568] BUG: KASAN: use-after-free in kmemdup+0x36/0x50 [ 548.587715] Read of size 64 at addr ffff8801e9c265ff by task mount/1295 [ 548.589428] CPU: 1 PID: 1295 Comm: mount Not tainted 4.18.0-rc1+ #4 [ 548.589432] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 548.589438] Call Trace: [ 548.589474] dump_stack+0x7b/0xb5 [ 548.589487] print_address_description+0x70/0x290 [ 548.589492] kasan_report+0x291/0x390 [ 548.589496] ? 
kmemdup+0x36/0x50 [ 548.589509] check_memory_region+0x139/0x190 [ 548.589514] memcpy+0x23/0x50 [ 548.589518] kmemdup+0x36/0x50 [ 548.589545] f2fs_build_segment_manager+0x8fa/0x3410 [ 548.589551] ? __asan_loadN+0xf/0x20 [ 548.589560] ? f2fs_sanity_check_ckpt+0x1be/0x240 [ 548.589566] ? f2fs_flush_sit_entries+0x10c0/0x10c0 [ 548.589587] ? __put_user_ns+0x40/0x40 [ 548.589604] ? find_next_bit+0x57/0x90 [ 548.589610] f2fs_fill_super+0x194b/0x2b40 [ 548.589617] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.589637] ? set_blocksize+0x90/0x140 [ 548.589651] mount_bdev+0x1c5/0x210 [ 548.589655] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.589667] f2fs_mount+0x15/0x20 [ 548.589672] mount_fs+0x60/0x1a0 [ 548.589683] ? alloc_vfsmnt+0x309/0x360 [ 548.589688] vfs_kern_mount+0x6b/0x1a0 [ 548.589699] do_mount+0x34a/0x18c0 [ 548.589710] ? lockref_put_or_lock+0xcf/0x160 [ 548.589716] ? copy_mount_string+0x20/0x20 [ 548.589728] ? memcg_kmem_put_cache+0x1b/0xa0 [ 548.589734] ? kasan_check_write+0x14/0x20 [ 548.589740] ? _copy_from_user+0x6a/0x90 [ 548.589744] ? 
memdup_user+0x42/0x60 [ 548.589750] ksys_mount+0x83/0xd0 [ 548.589755] __x64_sys_mount+0x67/0x80 [ 548.589781] do_syscall_64+0x78/0x170 [ 548.589797] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.589820] RIP: 0033:0x7f76fc331b9a [ 548.589821] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 548.589880] RSP: 002b:00007ffd4f0a0e48 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 548.589890] RAX: ffffffffffffffda RBX: 000000000146c030 RCX: 00007f76fc331b9a [ 548.589892] RDX: 000000000146c210 RSI: 000000000146df30 RDI: 0000000001474ec0 [ 548.589895] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 548.589897] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001474ec0 [ 548.589900] R13: 000000000146c210 R14: 0000000000000000 R15: 0000000000000003 [ 548.590242] The buggy address belongs to the page: [ 548.591243] page:ffffea0007a70980 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 548.592886] flags: 0x2ffff0000000000() [ 548.593665] raw: 02ffff0000000000 dead000000000100 dead000000000200 0000000000000000 [ 548.595258] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 548.603713] page dumped because: kasan: bad access detected [ 548.605203] Memory state around the buggy address: [ 548.606198] ffff8801e9c26480: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.607676] ffff8801e9c26500: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.609157] >ffff8801e9c26580: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.610629] ^ [ 548.612088] ffff8801e9c26600: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.613674] ffff8801e9c26680: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.615141] ================================================================== [ 548.616613] Disabling lock debugging due to kernel taint [ 548.622871] WARNING: CPU: 1 PID: 1295 
at mm/page_alloc.c:4065 __alloc_pages_slowpath+0xe4a/0x1420 [ 548.622878] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 548.623217] CPU: 1 PID: 1295 Comm: mount Tainted: G B 4.18.0-rc1+ #4 [ 548.623219] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 548.623226] RIP: 0010:__alloc_pages_slowpath+0xe4a/0x1420 [ 548.623227] Code: ff ff 01 89 85 c8 fe ff ff e9 91 fc ff ff 41 89 c5 e9 5c fc ff ff 0f 0b 89 f8 25 ff ff f7 ff 89 85 8c fe ff ff e9 d5 f2 ff ff <0f> 0b e9 65 f2 ff ff 65 8b 05 38 81 d2 47 f6 c4 01 74 1c 65 48 8b [ 548.623281] RSP: 0018:ffff8801f28c7678 EFLAGS: 00010246 [ 548.623284] RAX: 0000000000000000 RBX: 00000000006040c0 RCX: ffffffffb82f73b7 [ 548.623287] RDX: 1ffff1003e518eeb RSI: 000000000000000c RDI: 0000000000000000 [ 548.623290] RBP: ffff8801f28c7880 R08: 0000000000000000 R09: ffffed0047fff2c5 [ 548.623292] R10: 0000000000000001 R11: ffffed0047fff2c4 R12: ffff8801e88de040 [ 548.623295] R13: 00000000006040c0 R14: 000000000000000c R15: ffff8801f28c7938 [ 548.623299] FS: 00007f76fca51840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 548.623302] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 548.623304] CR2: 00007f19b9171760 CR3: 00000001ed952000 CR4: 00000000000006e0 [ 548.623317] Call Trace: [ 548.623325] ? kasan_check_read+0x11/0x20 [ 548.623330] ? __zone_watermark_ok+0x92/0x240 [ 548.623336] ? get_page_from_freelist+0x1c3/0x1d90 [ 548.623347] ? _raw_spin_lock_irqsave+0x2a/0x60 [ 548.623353] ? 
warn_alloc+0x250/0x250 [ 548.623358] ? save_stack+0x46/0xd0 [ 548.623361] ? kasan_kmalloc+0xad/0xe0 [ 548.623366] ? __isolate_free_page+0x2a0/0x2a0 [ 548.623370] ? mount_fs+0x60/0x1a0 [ 548.623374] ? vfs_kern_mount+0x6b/0x1a0 [ 548.623378] ? do_mount+0x34a/0x18c0 [ 548.623383] ? ksys_mount+0x83/0xd0 [ 548.623387] ? __x64_sys_mount+0x67/0x80 [ 548.623391] ? do_syscall_64+0x78/0x170 [ 548.623396] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.623401] __alloc_pages_nodemask+0x3c5/0x400 [ 548.623407] ? __alloc_pages_slowpath+0x1420/0x1420 [ 548.623412] ? __mutex_lock_slowpath+0x20/0x20 [ 548.623417] ? kvmalloc_node+0x31/0x80 [ 548.623424] alloc_pages_current+0x75/0x110 [ 548.623436] kmalloc_order+0x24/0x60 [ 548.623442] kmalloc_order_trace+0x24/0xb0 [ 548.623448] __kmalloc_track_caller+0x207/0x220 [ 548.623455] ? f2fs_build_node_manager+0x399/0xbb0 [ 548.623460] kmemdup+0x20/0x50 [ 548.623465] f2fs_build_node_manager+0x399/0xbb0 [ 548.623470] f2fs_fill_super+0x195e/0x2b40 [ 548.623477] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.623481] ? set_blocksize+0x90/0x140 [ 548.623486] mount_bdev+0x1c5/0x210 [ 548.623489] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.623495] f2fs_mount+0x15/0x20 [ 548.623498] mount_fs+0x60/0x1a0 [ 548.623503] ? alloc_vfsmnt+0x309/0x360 [ 548.623508] vfs_kern_mount+0x6b/0x1a0 [ 548.623513] do_mount+0x34a/0x18c0 [ 548.623518] ? lockref_put_or_lock+0xcf/0x160 [ 548.623523] ? copy_mount_string+0x20/0x20 [ 548.623528] ? memcg_kmem_put_cache+0x1b/0xa0 [ 548.623533] ? kasan_check_write+0x14/0x20 [ 548.623537] ? _copy_from_user+0x6a/0x90 [ 548.623542] ? 
memdup_user+0x42/0x60 [ 548.623547] ksys_mount+0x83/0xd0 [ 548.623552] __x64_sys_mount+0x67/0x80 [ 548.623557] do_syscall_64+0x78/0x170 [ 548.623562] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.623566] RIP: 0033:0x7f76fc331b9a [ 548.623567] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 548.623632] RSP: 002b:00007ffd4f0a0e48 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 548.623636] RAX: ffffffffffffffda RBX: 000000000146c030 RCX: 00007f76fc331b9a [ 548.623639] RDX: 000000000146c210 RSI: 000000000146df30 RDI: 0000000001474ec0 [ 548.623641] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 548.623643] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001474ec0 [ 548.623646] R13: 000000000146c210 R14: 0000000000000000 R15: 0000000000000003 [ 548.623650] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 548.623656] F2FS-fs (loop0): Failed to initialize F2FS node manager [ 548.627936] F2FS-fs (loop0): Invalid log blocks per segment (8201) [ 548.627940] F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock [ 548.635835] F2FS-fs (loop0): Failed to initialize F2FS node manager - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.c#L3578 sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); Buffer overrun happens when doing memcpy. I suspect there is missing (inconsistent) checks on bitmap_size. Reported by Wen Xu (wen.xu@gatech.edu) from SSLab, Gatech. 
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index df0a19397c63..6c3231975ae5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2284,12 +2284,17 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; unsigned int main_segs, blocks_per_seg; + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); - fsmeta += le32_to_cpu(raw_super->segment_count_sit); - fsmeta += le32_to_cpu(raw_super->segment_count_nat); + sit_segs = le32_to_cpu(raw_super->segment_count_sit); + fsmeta += sit_segs; + nat_segs = le32_to_cpu(raw_super->segment_count_nat); + fsmeta += nat_segs; fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); @@ -2320,6 +2325,18 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong bitmap size: sit: %u, nat:%u", + sit_bitmap_size, nat_bitmap_size); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From f983081fe45fe2ac5427664c00c0e76013d68cba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Jun 2018 20:33:24 +0800 Subject: [PATCH 0910/1212] f2fs: fix to correct return value of f2fs_trim_fs We should account trimmed block number from 
__wait_all_discard_cmd in __issue_discard_cmd_range, otherwise trimmed blocks returned by f2fs_trim_fs will be wrong, this patch fixes it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6b1b8ec9ba58..4c6ede354774 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1400,21 +1400,22 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, return trimmed; } -static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_policy dp; + unsigned int discard_blks; - if (dpolicy) { - __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); - return; - } + if (dpolicy) + return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); - __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); - __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + + return discard_blks; } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -2448,7 +2449,7 @@ bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, return has_candidate; } -static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, +static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, unsigned int start, unsigned int end) { @@ -2458,6 +2459,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct blk_plug plug; int issued; + unsigned int trimmed = 0; next: issued = 0; @@ -2495,7 +2497,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); 
mutex_unlock(&dcc->cmd_lock); - __wait_all_discard_cmd(sbi, NULL); + trimmed += __wait_all_discard_cmd(sbi, NULL); congestion_wait(BLK_RW_ASYNC, HZ/50); goto next; } @@ -2509,6 +2511,8 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + return trimmed; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) @@ -2566,9 +2570,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); - __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + trimmed = __issue_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); range->len = F2FS_BLK_TO_BYTES(trimmed); out: From 922a1f96e6841436dab049866b061672979ffcca Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Jun 2018 23:29:49 +0800 Subject: [PATCH 0911/1212] f2fs: fix to do sanity check with extra_attr feature If FI_EXTRA_ATTR is set in an inode by fuzzing, inode.i_addr[0] will be parsed as inode.i_extra_isize; then, in __recover_inline_status, the inline data address will be beyond the boundary of the page, resulting in access to invalid memory. So in this condition, while reading the inode page, let's do a sanity check between the EXTRA_ATTR feature of the fs and the extra_attr bit of the inode; if they're inconsistent, refuse to load this inode. - Overview Out-of-bound access in f2fs_iget() when mounting a corrupted f2fs image - Reproduce The following message is produced by a KASAN build of the 4.18 upstream kernel. 
[ 819.392227] ================================================================== [ 819.393901] BUG: KASAN: slab-out-of-bounds in f2fs_iget+0x736/0x1530 [ 819.395329] Read of size 4 at addr ffff8801f099c968 by task mount/1292 [ 819.397079] CPU: 1 PID: 1292 Comm: mount Not tainted 4.18.0-rc1+ #4 [ 819.397082] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 819.397088] Call Trace: [ 819.397124] dump_stack+0x7b/0xb5 [ 819.397154] print_address_description+0x70/0x290 [ 819.397159] kasan_report+0x291/0x390 [ 819.397163] ? f2fs_iget+0x736/0x1530 [ 819.397176] check_memory_region+0x139/0x190 [ 819.397182] __asan_loadN+0xf/0x20 [ 819.397185] f2fs_iget+0x736/0x1530 [ 819.397197] f2fs_fill_super+0x1b4f/0x2b40 [ 819.397202] ? f2fs_fill_super+0x1b4f/0x2b40 [ 819.397208] ? f2fs_commit_super+0x1b0/0x1b0 [ 819.397227] ? set_blocksize+0x90/0x140 [ 819.397241] mount_bdev+0x1c5/0x210 [ 819.397245] ? f2fs_commit_super+0x1b0/0x1b0 [ 819.397252] f2fs_mount+0x15/0x20 [ 819.397256] mount_fs+0x60/0x1a0 [ 819.397267] ? alloc_vfsmnt+0x309/0x360 [ 819.397272] vfs_kern_mount+0x6b/0x1a0 [ 819.397282] do_mount+0x34a/0x18c0 [ 819.397300] ? lockref_put_or_lock+0xcf/0x160 [ 819.397306] ? copy_mount_string+0x20/0x20 [ 819.397318] ? memcg_kmem_put_cache+0x1b/0xa0 [ 819.397324] ? kasan_check_write+0x14/0x20 [ 819.397334] ? _copy_from_user+0x6a/0x90 [ 819.397353] ? 
memdup_user+0x42/0x60 [ 819.397359] ksys_mount+0x83/0xd0 [ 819.397365] __x64_sys_mount+0x67/0x80 [ 819.397388] do_syscall_64+0x78/0x170 [ 819.397403] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 819.397422] RIP: 0033:0x7f54c667cb9a [ 819.397424] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 819.397483] RSP: 002b:00007ffd8f46cd08 EFLAGS: 00000202 ORIG_RAX: 00000000000000a5 [ 819.397496] RAX: ffffffffffffffda RBX: 0000000000dfa030 RCX: 00007f54c667cb9a [ 819.397498] RDX: 0000000000dfa210 RSI: 0000000000dfbf30 RDI: 0000000000e02ec0 [ 819.397501] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 819.397503] R10: 00000000c0ed0000 R11: 0000000000000202 R12: 0000000000e02ec0 [ 819.397505] R13: 0000000000dfa210 R14: 0000000000000000 R15: 0000000000000003 [ 819.397866] Allocated by task 139: [ 819.398702] save_stack+0x46/0xd0 [ 819.398705] kasan_kmalloc+0xad/0xe0 [ 819.398709] kasan_slab_alloc+0x11/0x20 [ 819.398713] kmem_cache_alloc+0xd1/0x1e0 [ 819.398717] dup_fd+0x50/0x4c0 [ 819.398740] copy_process.part.37+0xbed/0x32e0 [ 819.398744] _do_fork+0x16e/0x590 [ 819.398748] __x64_sys_clone+0x69/0x80 [ 819.398752] do_syscall_64+0x78/0x170 [ 819.398756] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 819.399097] Freed by task 159: [ 819.399743] save_stack+0x46/0xd0 [ 819.399747] __kasan_slab_free+0x13c/0x1a0 [ 819.399750] kasan_slab_free+0xe/0x10 [ 819.399754] kmem_cache_free+0x89/0x1e0 [ 819.399757] put_files_struct+0x132/0x150 [ 819.399761] exit_files+0x62/0x70 [ 819.399766] do_exit+0x47b/0x1390 [ 819.399770] do_group_exit+0x86/0x130 [ 819.399774] __x64_sys_exit_group+0x2c/0x30 [ 819.399778] do_syscall_64+0x78/0x170 [ 819.399782] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 819.400115] The buggy address belongs to the object at ffff8801f099c680 which belongs to the cache files_cache of size 704 [ 819.403234] The 
buggy address is located 40 bytes to the right of 704-byte region [ffff8801f099c680, ffff8801f099c940) [ 819.405689] The buggy address belongs to the page: [ 819.406709] page:ffffea0007c26700 count:1 mapcount:0 mapping:ffff8801f69a3340 index:0xffff8801f099d380 compound_mapcount: 0 [ 819.408984] flags: 0x2ffff0000008100(slab|head) [ 819.409932] raw: 02ffff0000008100 ffffea00077fb600 0000000200000002 ffff8801f69a3340 [ 819.411514] raw: ffff8801f099d380 0000000080130000 00000001ffffffff 0000000000000000 [ 819.413073] page dumped because: kasan: bad access detected [ 819.414539] Memory state around the buggy address: [ 819.415521] ffff8801f099c800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 819.416981] ffff8801f099c880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 819.418454] >ffff8801f099c900: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 819.419921] ^ [ 819.421265] ffff8801f099c980: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb [ 819.422745] ffff8801f099ca00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 819.424206] ================================================================== [ 819.425668] Disabling lock debugging due to kernel taint [ 819.457463] F2FS-fs (loop0): Mounted with checkpoint version = 3 The kernel still mounts the image. 
If you run the following program on the mounted folder mnt, (poc.c) static void activity(char *mpoint) { char *foo_bar_baz; int err; static int buf[8192]; memset(buf, 0, sizeof(buf)); err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); int fd = open(foo_bar_baz, O_RDONLY, 0); if (fd >= 0) { read(fd, (char *)buf, 11); close(fd); } } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } You can get kernel crash: [ 819.457463] F2FS-fs (loop0): Mounted with checkpoint version = 3 [ 918.028501] BUG: unable to handle kernel paging request at ffffed0048000d82 [ 918.044020] PGD 23ffee067 P4D 23ffee067 PUD 23fbef067 PMD 0 [ 918.045207] Oops: 0000 [#1] SMP KASAN PTI [ 918.046048] CPU: 0 PID: 1309 Comm: poc Tainted: G B 4.18.0-rc1+ #4 [ 918.047573] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 918.049552] RIP: 0010:check_memory_region+0x5e/0x190 [ 918.050565] Code: f8 49 c1 e8 03 49 89 db 49 c1 eb 03 4d 01 cb 4d 01 c1 4d 8d 63 01 4c 89 c8 4d 89 e2 4d 29 ca 49 83 fa 10 7f 3d 4d 85 d2 74 32 <41> 80 39 00 75 23 48 b8 01 00 00 00 00 fc ff df 4d 01 d1 49 01 c0 [ 918.054322] RSP: 0018:ffff8801e3a1f258 EFLAGS: 00010202 [ 918.055400] RAX: ffffed0048000d82 RBX: ffff880240006c11 RCX: ffffffffb8867d14 [ 918.056832] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff880240006c10 [ 918.058253] RBP: ffff8801e3a1f268 R08: 1ffff10048000d82 R09: ffffed0048000d82 [ 918.059717] R10: 0000000000000001 R11: ffffed0048000d82 R12: ffffed0048000d83 [ 918.061159] R13: ffff8801e3a1f390 R14: 0000000000000000 R15: ffff880240006c08 [ 918.062614] FS: 00007fac9732c700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 918.064246] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 918.065412] CR2: ffffed0048000d82 CR3: 00000001df77a000 CR4: 00000000000006f0 [ 918.066882] Call Trace: [ 918.067410] __asan_loadN+0xf/0x20 [ 918.068149] f2fs_find_target_dentry+0xf4/0x270 [ 918.069083] ? 
__get_node_page+0x331/0x5b0 [ 918.069925] f2fs_find_in_inline_dir+0x24b/0x310 [ 918.070881] ? f2fs_recover_inline_data+0x4c0/0x4c0 [ 918.071905] ? unwind_next_frame.part.5+0x34f/0x490 [ 918.072901] ? unwind_dump+0x290/0x290 [ 918.073695] ? is_bpf_text_address+0xe/0x20 [ 918.074566] __f2fs_find_entry+0x599/0x670 [ 918.075408] ? kasan_unpoison_shadow+0x36/0x50 [ 918.076315] ? kasan_kmalloc+0xad/0xe0 [ 918.077100] ? memcg_kmem_put_cache+0x55/0xa0 [ 918.077998] ? f2fs_find_target_dentry+0x270/0x270 [ 918.079006] ? d_set_d_op+0x30/0x100 [ 918.079749] ? __d_lookup_rcu+0x69/0x2e0 [ 918.080556] ? __d_alloc+0x275/0x450 [ 918.081297] ? kasan_check_write+0x14/0x20 [ 918.082135] ? memset+0x31/0x40 [ 918.082820] ? fscrypt_setup_filename+0x1ec/0x4c0 [ 918.083782] ? d_alloc_parallel+0x5bb/0x8c0 [ 918.084640] f2fs_find_entry+0xe9/0x110 [ 918.085432] ? __f2fs_find_entry+0x670/0x670 [ 918.086308] ? kasan_check_write+0x14/0x20 [ 918.087163] f2fs_lookup+0x297/0x590 [ 918.087902] ? f2fs_link+0x2b0/0x2b0 [ 918.088646] ? legitimize_path.isra.29+0x61/0xa0 [ 918.089589] __lookup_slow+0x12e/0x240 [ 918.090371] ? may_delete+0x2b0/0x2b0 [ 918.091123] ? __nd_alloc_stack+0xa0/0xa0 [ 918.091944] lookup_slow+0x44/0x60 [ 918.092642] walk_component+0x3ee/0xa40 [ 918.093428] ? is_bpf_text_address+0xe/0x20 [ 918.094283] ? pick_link+0x3e0/0x3e0 [ 918.095047] ? in_group_p+0xa5/0xe0 [ 918.095771] ? generic_permission+0x53/0x1e0 [ 918.096666] ? security_inode_permission+0x1d/0x70 [ 918.097646] ? inode_permission+0x7a/0x1f0 [ 918.098497] link_path_walk+0x2a2/0x7b0 [ 918.099298] ? apparmor_capget+0x3d0/0x3d0 [ 918.100140] ? walk_component+0xa40/0xa40 [ 918.100958] ? path_init+0x2e6/0x580 [ 918.101695] path_openat+0x1bb/0x2160 [ 918.102471] ? __save_stack_trace+0x92/0x100 [ 918.103352] ? save_stack+0xb5/0xd0 [ 918.104070] ? vfs_unlink+0x250/0x250 [ 918.104822] ? save_stack+0x46/0xd0 [ 918.105538] ? kasan_slab_alloc+0x11/0x20 [ 918.106370] ? kmem_cache_alloc+0xd1/0x1e0 [ 918.107213] ? 
getname_flags+0x76/0x2c0 [ 918.107997] ? getname+0x12/0x20 [ 918.108677] ? do_sys_open+0x14b/0x2c0 [ 918.109450] ? __x64_sys_open+0x4c/0x60 [ 918.110255] ? do_syscall_64+0x78/0x170 [ 918.111083] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 918.112148] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 918.113204] ? f2fs_empty_inline_dir+0x1e0/0x1e0 [ 918.114150] ? timespec64_trunc+0x5c/0x90 [ 918.114993] ? wb_io_lists_depopulated+0x1a/0xc0 [ 918.115937] ? inode_io_list_move_locked+0x102/0x110 [ 918.116949] do_filp_open+0x12b/0x1d0 [ 918.117709] ? may_open_dev+0x50/0x50 [ 918.118475] ? kasan_kmalloc+0xad/0xe0 [ 918.119246] do_sys_open+0x17c/0x2c0 [ 918.119983] ? do_sys_open+0x17c/0x2c0 [ 918.120751] ? filp_open+0x60/0x60 [ 918.121463] ? task_work_run+0x4d/0xf0 [ 918.122237] __x64_sys_open+0x4c/0x60 [ 918.123001] do_syscall_64+0x78/0x170 [ 918.123759] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 918.124802] RIP: 0033:0x7fac96e3e040 [ 918.125537] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 09 27 2d 00 00 75 10 b8 02 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 7e e0 01 00 48 89 04 24 [ 918.129341] RSP: 002b:00007fff1b37f848 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 [ 918.130870] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fac96e3e040 [ 918.132295] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000122d080 [ 918.133748] RBP: 00007fff1b37f9b0 R08: 00007fac9710bbd8 R09: 0000000000000001 [ 918.135209] R10: 000000000000069d R11: 0000000000000246 R12: 0000000000400c20 [ 918.136650] R13: 00007fff1b37fab0 R14: 0000000000000000 R15: 0000000000000000 [ 918.138093] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too 
crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 918.147924] CR2: ffffed0048000d82 [ 918.148619] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 918.149563] RIP: 0010:check_memory_region+0x5e/0x190 [ 918.150576] Code: f8 49 c1 e8 03 49 89 db 49 c1 eb 03 4d 01 cb 4d 01 c1 4d 8d 63 01 4c 89 c8 4d 89 e2 4d 29 ca 49 83 fa 10 7f 3d 4d 85 d2 74 32 <41> 80 39 00 75 23 48 b8 01 00 00 00 00 fc ff df 4d 01 d1 49 01 c0 [ 918.154360] RSP: 0018:ffff8801e3a1f258 EFLAGS: 00010202 [ 918.155411] RAX: ffffed0048000d82 RBX: ffff880240006c11 RCX: ffffffffb8867d14 [ 918.156833] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff880240006c10 [ 918.158257] RBP: ffff8801e3a1f268 R08: 1ffff10048000d82 R09: ffffed0048000d82 [ 918.159722] R10: 0000000000000001 R11: ffffed0048000d82 R12: ffffed0048000d83 [ 918.161149] R13: ffff8801e3a1f390 R14: 0000000000000000 R15: ffff880240006c08 [ 918.162587] FS: 00007fac9732c700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 918.164203] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 918.165356] CR2: ffffed0048000d82 CR3: 00000001df77a000 CR4: 00000000000006f0 Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 20efd4544a52..86b8e849ae0b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -205,6 +205,16 @@ static bool sanity_check_inode(struct inode *inode) __func__, inode->i_ino); return false; } + + if (f2fs_has_extra_attr(inode) && + !f2fs_sb_has_extra_attr(sbi->sb)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) is with extra_attr, " + "but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } return true; } @@ -257,6 +267,11 @@ static int do_read_inode(struct inode *inode) 
get_inline_info(inode, ri); + if (!sanity_check_inode(inode)) { + f2fs_put_page(node_page, 1); + return -EINVAL; + } + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; @@ -338,10 +353,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = do_read_inode(inode); if (ret) goto bad_inode; - if (!sanity_check_inode(inode)) { - ret = -EINVAL; - goto bad_inode; - } make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; From af4137733f9cf91b00f8297de3b92e5e4e643f39 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Jun 2018 18:05:54 +0800 Subject: [PATCH 0912/1212] f2fs: fix to do sanity check with user_block_count This patch fixes the missing sanity check on user_block_count. - Overview Divide-by-zero in utilization when mounting a corrupted f2fs image - Reproduce (4.18 upstream kernel) - Kernel message [ 564.099503] F2FS-fs (loop0): invalid crc value [ 564.101991] divide error: 0000 [#1] SMP KASAN PTI [ 564.103103] CPU: 1 PID: 1298 Comm: f2fs_discard-7: Not tainted 4.18.0-rc1+ #4 [ 564.104584] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 564.106624] RIP: 0010:issue_discard_thread+0x248/0x5c0 [ 564.107692] Code: ff ff 48 8b bd e8 fe ff ff 41 8b 9d 4c 04 00 00 e8 cd b8 ad ff 41 8b 85 50 04 00 00 31 d2 48 8d 04 80 48 8d 04 80 48 c1 e0 02 <48> f7 f3 83 f8 50 7e 16 41 c7 86 7c ff ff ff 01 00 00 00 41 c7 86 [ 564.111686] RSP: 0018:ffff8801f3117dc0 EFLAGS: 00010206 [ 564.112775] RAX: 0000000000000384 RBX: 0000000000000000 RCX: ffffffffb88c1e03 [ 564.114250] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e3aa4850 [ 564.115706] RBP: ffff8801f3117f00 R08: 1ffffffff751a1d0 R09: fffffbfff751a1d0 [ 564.117177] R10: 0000000000000001 R11: fffffbfff751a1d0 R12: 00000000fffffffc [ 564.118634] R13: ffff8801e3aa4400 R14: ffff8801f3117ed8 R15: ffff8801e2050000 [ 564.120094] FS: 0000000000000000(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [
564.121748] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 564.122923] CR2: 000000000202b078 CR3: 00000001f11ac000 CR4: 00000000000006e0 [ 564.124383] Call Trace: [ 564.124924] ? __issue_discard_cmd+0x480/0x480 [ 564.125882] ? __sched_text_start+0x8/0x8 [ 564.126756] ? __kthread_parkme+0xcb/0x100 [ 564.127620] ? kthread_blkcg+0x70/0x70 [ 564.128412] kthread+0x180/0x1d0 [ 564.129105] ? __issue_discard_cmd+0x480/0x480 [ 564.130029] ? kthread_associate_blkcg+0x150/0x150 [ 564.131033] ret_from_fork+0x35/0x40 [ 564.131794] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 564.141798] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 564.142773] RIP: 0010:issue_discard_thread+0x248/0x5c0 [ 564.143885] Code: ff ff 48 8b bd e8 fe ff ff 41 8b 9d 4c 04 00 00 e8 cd b8 ad ff 41 8b 85 50 04 00 00 31 d2 48 8d 04 80 48 8d 04 80 48 c1 e0 02 <48> f7 f3 83 f8 50 7e 16 41 c7 86 7c ff ff ff 01 00 00 00 41 c7 86 [ 564.147776] RSP: 0018:ffff8801f3117dc0 EFLAGS: 00010206 [ 564.148856] RAX: 0000000000000384 RBX: 0000000000000000 RCX: ffffffffb88c1e03 [ 564.150424] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e3aa4850 [ 564.151906] RBP: ffff8801f3117f00 R08: 1ffffffff751a1d0 R09: fffffbfff751a1d0 [ 564.153463] R10: 0000000000000001 R11: fffffbfff751a1d0 R12: 00000000fffffffc [ 564.154915] R13: ffff8801e3aa4400 R14: ffff8801f3117ed8 R15: ffff8801e2050000 [ 564.156405] FS: 0000000000000000(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 564.158070] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 
564.159279] CR2: 000000000202b078 CR3: 00000001f11ac000 CR4: 00000000000006e0 [ 564.161043] ================================================================== [ 564.162587] BUG: KASAN: stack-out-of-bounds in from_kuid_munged+0x1d/0x50 [ 564.163994] Read of size 4 at addr ffff8801f3117c84 by task f2fs_discard-7:/1298 [ 564.165852] CPU: 1 PID: 1298 Comm: f2fs_discard-7: Tainted: G D 4.18.0-rc1+ #4 [ 564.167593] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 564.169522] Call Trace: [ 564.170057] dump_stack+0x7b/0xb5 [ 564.170778] print_address_description+0x70/0x290 [ 564.171765] kasan_report+0x291/0x390 [ 564.172540] ? from_kuid_munged+0x1d/0x50 [ 564.173408] __asan_load4+0x78/0x80 [ 564.174148] from_kuid_munged+0x1d/0x50 [ 564.174962] do_notify_parent+0x1f5/0x4f0 [ 564.175808] ? send_sigqueue+0x390/0x390 [ 564.176639] ? css_set_move_task+0x152/0x340 [ 564.184197] do_exit+0x1290/0x1390 [ 564.184950] ? __issue_discard_cmd+0x480/0x480 [ 564.185884] ? mm_update_next_owner+0x380/0x380 [ 564.186829] ? __sched_text_start+0x8/0x8 [ 564.187672] ? __kthread_parkme+0xcb/0x100 [ 564.188528] ? kthread_blkcg+0x70/0x70 [ 564.189333] ? kthread+0x180/0x1d0 [ 564.190052] ? 
__issue_discard_cmd+0x480/0x480 [ 564.190983] rewind_stack_do_exit+0x17/0x20 [ 564.192190] The buggy address belongs to the page: [ 564.193213] page:ffffea0007cc45c0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 564.194856] flags: 0x2ffff0000000000() [ 564.195644] raw: 02ffff0000000000 0000000000000000 dead000000000200 0000000000000000 [ 564.197247] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 564.198826] page dumped because: kasan: bad access detected [ 564.200299] Memory state around the buggy address: [ 564.201306] ffff8801f3117b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 564.202779] ffff8801f3117c00: 00 00 00 00 00 00 00 00 00 00 00 f3 f3 f3 f3 f3 [ 564.204252] >ffff8801f3117c80: f3 f3 f3 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 [ 564.205742] ^ [ 564.206424] ffff8801f3117d00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 564.207908] ffff8801f3117d80: f3 f3 f3 f3 f3 f3 f3 f3 00 00 00 00 00 00 00 00 [ 564.209389] ================================================================== [ 564.231795] F2FS-fs (loop0): Mounted with checkpoint version = 2 - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L586 return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); Missing checks on sbi->user_block_count. 
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6c3231975ae5..b0aa9ce320e8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2287,6 +2287,8 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int sit_segs, nat_segs; unsigned int sit_bitmap_size, nat_bitmap_size; unsigned int log_blocks_per_seg; + unsigned int segment_count_main; + block_t user_block_count; int i; total = le32_to_cpu(raw_super->segment_count); @@ -2311,6 +2313,16 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + user_block_count = le64_to_cpu(ckpt->user_block_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + if (!user_block_count || user_block_count >= + segment_count_main << log_blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong user_block_count: %u", user_block_count); + return 1; + } + main_segs = le32_to_cpu(raw_super->segment_count_main); blocks_per_seg = sbi->blocks_per_seg; @@ -2327,7 +2339,6 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); - log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { From 0588ef4a094250ed4ccf7e383e9fdf60548d3ed2 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 27 Jun 2018 14:46:21 +0800 Subject: [PATCH 0913/1212] f2fs: Allocate and stat mem used by free nid bitmap more accurately This patch used f2fs_bitmap_size macro to calculate mem used by free nid bitmap, and stat used mem including aligned part. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 3 ++- fs/f2fs/node.c | 2 +- include/linux/f2fs_fs.h | 5 ----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 2d65e77ae5cf..214a968962a1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -215,7 +215,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); - si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks * + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK); si->base_mem += NM_I(sbi)->nat_blocks / 8; si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index aa403fc1851e..6c24811e7b9d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2800,7 +2800,7 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, - NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL); + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2ebfa01b7091..8e6a18582566 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -304,11 +304,6 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) -#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) -#define NAT_ENTRY_BITMAP_SIZE_ALIGNED \ - ((NAT_ENTRY_BITMAP_SIZE + BITS_PER_LONG - 1) / \ - BITS_PER_LONG * BITS_PER_LONG) - struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ From 0c700609b7d8ae5148eb0df22d470d093aa487ad Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Jun 2018 13:55:22 +0800 Subject: [PATCH 0914/1212] f2fs: fix to do 
sanity check with node footer and iblocks This patch adds sanity checks on the following inode fields to avoid the reported panic. - node footer - iblocks https://bugzilla.kernel.org/show_bug.cgi?id=200223 - Overview BUG() triggered in f2fs_truncate_inode_blocks() when un-mounting a mounted f2fs image after writing to it - Reproduce - POC (poc.c) static void activity(char *mpoint) { char *foo_bar_baz; int err; static int buf[8192]; memset(buf, 0, sizeof(buf)); err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); // open / write / read int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777); if (fd >= 0) { write(fd, (char *)buf, 517); write(fd, (char *)buf, sizeof(buf)); close(fd); } } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - Kernel message [ 552.479723] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 556.451891] ------------[ cut here ]------------ [ 556.451899] kernel BUG at fs/f2fs/node.c:987! [ 556.452920] invalid opcode: 0000 [#1] SMP KASAN PTI [ 556.453936] CPU: 1 PID: 1310 Comm: umount Not tainted 4.18.0-rc1+ #4 [ 556.455213] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 556.457140] RIP: 0010:f2fs_truncate_inode_blocks+0x4a7/0x6f0 [ 556.458280] Code: e8 ae ea ff ff 41 89 c7 c1 e8 1f 84 c0 74 0a 41 83 ff fe 0f 85 35 ff ff ff 81 85 b0 fe ff ff fb 03 00 00 e9 f7 fd ff ff 0f 0b <0f> 0b e8 62 b7 9a 00 48 8b bd a0 fe ff ff e8 56 54 ae ff 48 8b b5 [ 556.462015] RSP: 0018:ffff8801f292f808 EFLAGS: 00010286 [ 556.463068] RAX: ffffed003e73242d RBX: ffff8801f292f958 RCX: ffffffffb88b81bc [ 556.464479] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff8801f3992164 [ 556.465901] RBP: ffff8801f292f980 R08: ffffed003e73242d R09: ffffed003e73242d [ 556.467311] R10: 0000000000000001 R11: ffffed003e73242c R12: 00000000fffffc64 [ 556.468706] R13: ffff8801f3992000 R14: 0000000000000058 R15: 00000000ffff8801 [ 556.470117] FS: 00007f8029297840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [
556.471702] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 556.472838] CR2: 000055f5f57305d8 CR3: 00000001f18b0000 CR4: 00000000000006e0 [ 556.474265] Call Trace: [ 556.474782] ? f2fs_alloc_nid_failed+0xf0/0xf0 [ 556.475686] ? truncate_nodes+0x980/0x980 [ 556.476516] ? pagecache_get_page+0x21f/0x2f0 [ 556.477412] ? __asan_loadN+0xf/0x20 [ 556.478153] ? __get_node_page+0x331/0x5b0 [ 556.478992] ? reweight_entity+0x1e6/0x3b0 [ 556.479826] f2fs_truncate_blocks+0x55e/0x740 [ 556.480709] ? f2fs_truncate_data_blocks+0x20/0x20 [ 556.481689] ? __radix_tree_lookup+0x34/0x160 [ 556.482630] ? radix_tree_lookup+0xd/0x10 [ 556.483445] f2fs_truncate+0xd4/0x1a0 [ 556.484206] f2fs_evict_inode+0x5ce/0x630 [ 556.485032] evict+0x16f/0x290 [ 556.485664] iput+0x280/0x300 [ 556.486300] dentry_unlink_inode+0x165/0x1e0 [ 556.487169] __dentry_kill+0x16a/0x260 [ 556.487936] dentry_kill+0x70/0x250 [ 556.488651] shrink_dentry_list+0x125/0x260 [ 556.489504] shrink_dcache_parent+0xc1/0x110 [ 556.490379] ? shrink_dcache_sb+0x200/0x200 [ 556.491231] ? 
bit_wait_timeout+0xc0/0xc0 [ 556.492047] do_one_tree+0x12/0x40 [ 556.492743] shrink_dcache_for_umount+0x3f/0xa0 [ 556.493656] generic_shutdown_super+0x43/0x1c0 [ 556.494561] kill_block_super+0x52/0x80 [ 556.495341] kill_f2fs_super+0x62/0x70 [ 556.496105] deactivate_locked_super+0x6f/0xa0 [ 556.497004] deactivate_super+0x5e/0x80 [ 556.497785] cleanup_mnt+0x61/0xa0 [ 556.498492] __cleanup_mnt+0x12/0x20 [ 556.499218] task_work_run+0xc8/0xf0 [ 556.499949] exit_to_usermode_loop+0x125/0x130 [ 556.500846] do_syscall_64+0x138/0x170 [ 556.501609] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 556.502659] RIP: 0033:0x7f8028b77487 [ 556.503384] Code: 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 c9 2b 00 f7 d8 64 89 01 48 [ 556.507137] RSP: 002b:00007fff9f2e3598 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 556.508637] RAX: 0000000000000000 RBX: 0000000000ebd030 RCX: 00007f8028b77487 [ 556.510069] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000ec41e0 [ 556.511481] RBP: 0000000000ec41e0 R08: 0000000000000000 R09: 0000000000000014 [ 556.512892] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f802908083c [ 556.514320] R13: 0000000000000000 R14: 0000000000ebd210 R15: 00007fff9f2e3820 [ 556.515745] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 556.529276] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 556.530340] RIP: 0010:f2fs_truncate_inode_blocks+0x4a7/0x6f0 [ 556.531513] Code: 
e8 ae ea ff ff 41 89 c7 c1 e8 1f 84 c0 74 0a 41 83 ff fe 0f 85 35 ff ff ff 81 85 b0 fe ff ff fb 03 00 00 e9 f7 fd ff ff 0f 0b <0f> 0b e8 62 b7 9a 00 48 8b bd a0 fe ff ff e8 56 54 ae ff 48 8b b5 [ 556.535330] RSP: 0018:ffff8801f292f808 EFLAGS: 00010286 [ 556.536395] RAX: ffffed003e73242d RBX: ffff8801f292f958 RCX: ffffffffb88b81bc [ 556.537824] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff8801f3992164 [ 556.539290] RBP: ffff8801f292f980 R08: ffffed003e73242d R09: ffffed003e73242d [ 556.540709] R10: 0000000000000001 R11: ffffed003e73242c R12: 00000000fffffc64 [ 556.542131] R13: ffff8801f3992000 R14: 0000000000000058 R15: 00000000ffff8801 [ 556.543579] FS: 00007f8029297840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 556.545180] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 556.546338] CR2: 000055f5f57305d8 CR3: 00000001f18b0000 CR4: 00000000000006e0 [ 556.547809] ================================================================== [ 556.549248] BUG: KASAN: stack-out-of-bounds in arch_tlb_gather_mmu+0x52/0x170 [ 556.550672] Write of size 8 at addr ffff8801f292fd10 by task umount/1310 [ 556.552338] CPU: 1 PID: 1310 Comm: umount Tainted: G D 4.18.0-rc1+ #4 [ 556.553886] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 556.555756] Call Trace: [ 556.556264] dump_stack+0x7b/0xb5 [ 556.556944] print_address_description+0x70/0x290 [ 556.557903] kasan_report+0x291/0x390 [ 556.558649] ? arch_tlb_gather_mmu+0x52/0x170 [ 556.559537] __asan_store8+0x57/0x90 [ 556.560268] arch_tlb_gather_mmu+0x52/0x170 [ 556.561110] tlb_gather_mmu+0x12/0x40 [ 556.561862] exit_mmap+0x123/0x2a0 [ 556.562555] ? __ia32_sys_munmap+0x50/0x50 [ 556.563384] ? exit_aio+0x98/0x230 [ 556.564079] ? __x32_compat_sys_io_submit+0x260/0x260 [ 556.565099] ? taskstats_exit+0x1f4/0x640 [ 556.565925] ? kasan_check_read+0x11/0x20 [ 556.566739] ? 
mm_update_next_owner+0x322/0x380 [ 556.567652] mmput+0x8b/0x1d0 [ 556.568260] do_exit+0x43a/0x1390 [ 556.568937] ? mm_update_next_owner+0x380/0x380 [ 556.569855] ? deactivate_super+0x5e/0x80 [ 556.570668] ? cleanup_mnt+0x61/0xa0 [ 556.571395] ? __cleanup_mnt+0x12/0x20 [ 556.572156] ? task_work_run+0xc8/0xf0 [ 556.572917] ? exit_to_usermode_loop+0x125/0x130 [ 556.573861] rewind_stack_do_exit+0x17/0x20 [ 556.574707] RIP: 0033:0x7f8028b77487 [ 556.575428] Code: Bad RIP value. [ 556.576106] RSP: 002b:00007fff9f2e3598 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 556.577599] RAX: 0000000000000000 RBX: 0000000000ebd030 RCX: 00007f8028b77487 [ 556.579020] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000ec41e0 [ 556.580422] RBP: 0000000000ec41e0 R08: 0000000000000000 R09: 0000000000000014 [ 556.581833] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f802908083c [ 556.583252] R13: 0000000000000000 R14: 0000000000ebd210 R15: 00007fff9f2e3820 [ 556.584983] The buggy address belongs to the page: [ 556.585961] page:ffffea0007ca4bc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 556.587540] flags: 0x2ffff0000000000() [ 556.588296] raw: 02ffff0000000000 0000000000000000 dead000000000200 0000000000000000 [ 556.589822] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 556.591359] page dumped because: kasan: bad access detected [ 556.592786] Memory state around the buggy address: [ 556.593753] ffff8801f292fc00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 556.595191] ffff8801f292fc80: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 00 00 00 [ 556.596613] >ffff8801f292fd00: 00 00 f3 00 00 00 00 f3 f3 00 00 00 00 f4 f4 f4 [ 556.598044] ^ [ 556.598797] ffff8801f292fd80: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 [ 556.600225] ffff8801f292fe00: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f4 f4 f4 [ 556.601647] ================================================================== - Location 
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/node.c#L987 case NODE_DIND_BLOCK: err = truncate_nodes(&dn, nofs, offset[1], 3); cont = 0; break; default: BUG(); <--- } Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 86b8e849ae0b..2d322eb18dca 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -193,9 +193,30 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } -static bool sanity_check_inode(struct inode *inode) +static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned long long iblocks; + + iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + if (!iblocks) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, " + "run fsck to fix.", + __func__, inode->i_ino, iblocks); + return false; + } + + if (ino_of_node(node_page) != nid_of_node(node_page)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode footer i_ino=%lx, ino,nid: " + "[%u, %u] run fsck to fix.", + __func__, inode->i_ino, + ino_of_node(node_page), nid_of_node(node_page)); + return false; + } if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && !f2fs_has_extra_attr(inode)) { @@ -267,7 +288,7 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); - if (!sanity_check_inode(inode)) { + if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); return -EINVAL; } From 062c37efc3290c675d97126cf9be94287abbcf81 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 30 Jun 2018 18:13:40 +0800 Subject: [PATCH 0915/1212] f2fs: fix to do sanity check with reserved blkaddr of inline inode As Wen Xu reported in bugzilla,
after image was injected with random data by fuzzing, inline inode would contain invalid reserved blkaddr, then during inline conversion, we will encounter illegal memory accessing reported by KASAN, the root cause of this is when writing out converted inline page, we will use invalid reserved blkaddr to update sit bitmap, result in accessing memory beyond sit bitmap boundary. In order to fix this issue, let's do sanity check with reserved block address of inline inode to avoid above condition. https://bugzilla.kernel.org/show_bug.cgi?id=200179 [ 1428.846352] BUG: KASAN: use-after-free in update_sit_entry+0x80/0x7f0 [ 1428.846618] Read of size 4 at addr ffff880194483540 by task a.out/2741 [ 1428.846855] CPU: 0 PID: 2741 Comm: a.out Tainted: G W 4.17.0+ #1 [ 1428.846858] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 1428.846860] Call Trace: [ 1428.846868] dump_stack+0x71/0xab [ 1428.846875] print_address_description+0x6b/0x290 [ 1428.846881] kasan_report+0x28e/0x390 [ 1428.846888] ? update_sit_entry+0x80/0x7f0 [ 1428.846898] update_sit_entry+0x80/0x7f0 [ 1428.846906] f2fs_allocate_data_block+0x6db/0xc70 [ 1428.846914] ? f2fs_get_node_info+0x14f/0x590 [ 1428.846920] do_write_page+0xc8/0x150 [ 1428.846928] f2fs_outplace_write_data+0xfe/0x210 [ 1428.846935] ? f2fs_do_write_node_page+0x170/0x170 [ 1428.846941] ? radix_tree_tag_clear+0xff/0x130 [ 1428.846946] ? __mod_node_page_state+0x22/0xa0 [ 1428.846951] ? inc_zone_page_state+0x54/0x100 [ 1428.846956] ? __test_set_page_writeback+0x336/0x5d0 [ 1428.846964] f2fs_convert_inline_page+0x407/0x6d0 [ 1428.846971] ? f2fs_read_inline_data+0x3b0/0x3b0 [ 1428.846978] ? __get_node_page+0x335/0x6b0 [ 1428.846987] f2fs_convert_inline_inode+0x41b/0x500 [ 1428.846994] ? f2fs_convert_inline_page+0x6d0/0x6d0 [ 1428.847000] ? kasan_unpoison_shadow+0x31/0x40 [ 1428.847005] ? 
kasan_kmalloc+0xa6/0xd0 [ 1428.847024] f2fs_file_mmap+0x79/0xc0 [ 1428.847029] mmap_region+0x58b/0x880 [ 1428.847037] ? arch_get_unmapped_area+0x370/0x370 [ 1428.847042] do_mmap+0x55b/0x7a0 [ 1428.847048] vm_mmap_pgoff+0x16f/0x1c0 [ 1428.847055] ? vma_is_stack_for_current+0x50/0x50 [ 1428.847062] ? __fsnotify_update_child_dentry_flags.part.1+0x160/0x160 [ 1428.847068] ? do_sys_open+0x206/0x2a0 [ 1428.847073] ? __fget+0xb4/0x100 [ 1428.847079] ksys_mmap_pgoff+0x278/0x360 [ 1428.847085] ? find_mergeable_anon_vma+0x50/0x50 [ 1428.847091] do_syscall_64+0x73/0x160 [ 1428.847098] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1428.847102] RIP: 0033:0x7fb1430766ba [ 1428.847103] Code: 89 f5 41 54 49 89 fc 55 53 74 35 49 63 e8 48 63 da 4d 89 f9 49 89 e8 4d 63 d6 48 89 da 4c 89 ee 4c 89 e7 b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 56 5b 5d 41 5c 41 5d 41 5e 41 5f c3 0f 1f 00 [ 1428.847162] RSP: 002b:00007ffc651d9388 EFLAGS: 00000246 ORIG_RAX: 0000000000000009 [ 1428.847167] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fb1430766ba [ 1428.847170] RDX: 0000000000000001 RSI: 0000000000001000 RDI: 0000000000000000 [ 1428.847173] RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000000 [ 1428.847176] R10: 0000000000008002 R11: 0000000000000246 R12: 0000000000000000 [ 1428.847179] R13: 0000000000001000 R14: 0000000000008002 R15: 0000000000000000 [ 1428.847252] Allocated by task 2683: [ 1428.847372] kasan_kmalloc+0xa6/0xd0 [ 1428.847380] kmem_cache_alloc+0xc8/0x1e0 [ 1428.847385] getname_flags+0x73/0x2b0 [ 1428.847390] user_path_at_empty+0x1d/0x40 [ 1428.847395] vfs_statx+0xc1/0x150 [ 1428.847401] __do_sys_newlstat+0x7e/0xd0 [ 1428.847405] do_syscall_64+0x73/0x160 [ 1428.847411] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1428.847466] Freed by task 2683: [ 1428.847566] __kasan_slab_free+0x137/0x190 [ 1428.847571] kmem_cache_free+0x85/0x1e0 [ 1428.847575] filename_lookup+0x191/0x280 [ 1428.847580] vfs_statx+0xc1/0x150 [ 1428.847585] __do_sys_newlstat+0x7e/0xd0 [ 
1428.847590] do_syscall_64+0x73/0x160 [ 1428.847596] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1428.847648] The buggy address belongs to the object at ffff880194483300 which belongs to the cache names_cache of size 4096 [ 1428.847946] The buggy address is located 576 bytes inside of 4096-byte region [ffff880194483300, ffff880194484300) [ 1428.848234] The buggy address belongs to the page: [ 1428.848366] page:ffffea0006512000 count:1 mapcount:0 mapping:ffff8801f3586380 index:0x0 compound_mapcount: 0 [ 1428.848606] flags: 0x17fff8000008100(slab|head) [ 1428.848737] raw: 017fff8000008100 dead000000000100 dead000000000200 ffff8801f3586380 [ 1428.848931] raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 [ 1428.849122] page dumped because: kasan: bad access detected [ 1428.849305] Memory state around the buggy address: [ 1428.849436] ffff880194483400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.849620] ffff880194483480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.849804] >ffff880194483500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.849985] ^ [ 1428.850120] ffff880194483580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.850303] ffff880194483600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.850498] ================================================================== Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index f586fb9b20fa..17125baa2ba7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -130,6 +130,16 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(dn); + set_sbi_flag(fio.sbi, SBI_NEED_FSCK); + f2fs_msg(fio.sbi->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to fix.", + __func__, 
dn->inode->i_ino, dn->data_blkaddr); + return -EINVAL; + } + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); f2fs_do_read_inline_data(page, dn->inode_page); @@ -363,6 +373,17 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, if (err) goto out; + if (unlikely(dn.data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(&dn); + set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + f2fs_msg(F2FS_P_SB(page)->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to fix.", + __func__, dir->i_ino, dn.data_blkaddr); + err = -EINVAL; + goto out; + } + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); From e6f326a93627888691a538d80d753fbc920ffcb2 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 30 Jun 2018 23:57:03 +0800 Subject: [PATCH 0916/1212] f2fs: avoid the global name 'fault_name' Non-prefix global name 'fault_name' will pollute global namespace, fix it. Refer to: https://lists.01.org/pipermail/kbuild-all/2018-June/049660.html To: Jaegeuk Kim To: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-kernel@vger.kernel.org Reported-by: kbuild test robot Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 196967c1bc69..a2ae88fc3567 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -67,7 +67,7 @@ struct f2fs_fault_info { unsigned int inject_type; }; -extern char *fault_name[FAULT_MAX]; +extern char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif @@ -1348,7 +1348,7 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ printk("%sF2FS-fs : inject %s in %s of %pF\n", \ - KERN_INFO, fault_name[type], \ + KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct 
f2fs_sb_info *sbi, int type) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b0aa9ce320e8..2a3338dacc5a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -41,7 +41,7 @@ static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION -char *fault_name[FAULT_MAX] = { +char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", From 76e16a4d6e69eb0ded796750383081a51f460ea8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 1 Jul 2018 13:57:06 -0700 Subject: [PATCH 0917/1212] f2fs: Replace strncpy with memcpy gcc 8.1.0 complains: fs/f2fs/namei.c: In function 'f2fs_update_extension_list': fs/f2fs/namei.c:257:3: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length fs/f2fs/namei.c:249:3: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length Using strncpy() is indeed less than perfect since the length of data to be copied has already been determined with strlen(). Replace strncpy() with memcpy() to address the warning and optimize the code a little. 
Signed-off-by: Guenter Roeck Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 198d5583da06..f3be4dc3cbe2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -246,7 +246,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return -EINVAL; if (hot) { - strncpy(extlist[count], name, strlen(name)); + memcpy(extlist[count], name, strlen(name)); sbi->raw_super->hot_ext_count = hot_count + 1; } else { char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; @@ -254,7 +254,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, memcpy(buf, &extlist[cold_count], F2FS_EXTENSION_LEN * hot_count); memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - strncpy(extlist[cold_count], name, strlen(name)); + memcpy(extlist[cold_count], name, strlen(name)); memcpy(&extlist[cold_count + 1], buf, F2FS_EXTENSION_LEN * hot_count); sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); From 271b5b05e1debaccedf4e3e30488096dfcd105d5 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 2 Jul 2018 10:40:19 +0800 Subject: [PATCH 0918/1212] f2fs: check the right return value of memory alloc function This patch checks the right return value of the memory allocation function: the NULL test after f2fs_kvzalloc() must look at the entry just allocated, free_nid_bitmap[i], rather than at the array pointer free_nid_bitmap. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6c24811e7b9d..52d95e01922a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2801,7 +2801,7 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); - if (!nm_i->free_nid_bitmap) + if (!nm_i->free_nid_bitmap[i]) return -ENOMEM; } From 4b0b25270d5d7baad3d228b73dbb81fb3a1df432 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala 
Date: Mon, 2 Jul 2018 11:37:40 +0530 Subject: [PATCH 0919/1212] f2fs: show the fsync_mode=nobarrier mount option This patch shows the fsync_mode=nobarrier mount option in f2fs_show_options(). Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2a3338dacc5a..2ae2cfcf06c8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1343,6 +1343,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) seq_printf(seq, ",fsync_mode=%s", "strict"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) + seq_printf(seq, ",fsync_mode=%s", "nobarrier"); return 0; } From e25e77eec111999a5765db0565747cfe19711b7f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Jul 2018 18:04:10 +0800 Subject: [PATCH 0920/1212] f2fs: try grabbing node page lock aggressively in sync scenario In a synchronous scenario, such as checkpoint(), we flush dirty node pages to the device synchronously. Writing back a node page can easily fail there because trylock_page() fails, especially under intensive lock contention, which can cause long checkpoint() latency. So let's use lock_page() in the synchronous scenario to avoid this issue. 
Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 52d95e01922a..15960985ae27 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1638,7 +1638,9 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, !is_cold_node(page))) continue; lock_node: - if (!trylock_page(page)) + if (wbc->sync_mode == WB_SYNC_ALL) + lock_page(page); + else if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { From 43abf6c39227e2a391ea8060bb730569bf51d5af Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Jul 2018 21:20:05 +0800 Subject: [PATCH 0921/1212] f2fs: fix to skip GC if type in SSA and SIT is inconsistent If segment type in SSA and SIT is inconsistent, we will encounter below BUG_ON during GC, to avoid this panic, let's just skip doing GC on such segment. The bug is triggered with image reported in below link: https://bugzilla.kernel.org/show_bug.cgi?id=200223 [ 388.060262] ------------[ cut here ]------------ [ 388.060268] kernel BUG at /home/y00370721/git/devf2fs/gc.c:989! 
[ 388.061172] invalid opcode: 0000 [#1] SMP [ 388.061773] Modules linked in: f2fs(O) bluetooth ecdh_generic xt_tcpudp iptable_filter ip_tables x_tables lp ttm drm_kms_helper drm intel_rapl sb_edac crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel fb_sys_fops ppdev aes_x86_64 syscopyarea crypto_simd sysfillrect parport_pc joydev sysimgblt glue_helper parport cryptd i2c_piix4 serio_raw mac_hid btrfs hid_generic usbhid hid raid6_pq psmouse pata_acpi floppy [ 388.064247] CPU: 7 PID: 4151 Comm: f2fs_gc-7:0 Tainted: G O 4.13.0-rc1+ #26 [ 388.065306] Hardware name: Xen HVM domU, BIOS 4.1.2_115-900.260_ 11/06/2015 [ 388.066058] task: ffff880201583b80 task.stack: ffffc90004d7c000 [ 388.069948] RIP: 0010:do_garbage_collect+0xcc8/0xcd0 [f2fs] [ 388.070766] RSP: 0018:ffffc90004d7fc68 EFLAGS: 00010202 [ 388.071783] RAX: ffff8801ed227000 RBX: 0000000000000001 RCX: ffffea0007b489c0 [ 388.072700] RDX: ffff880000000000 RSI: 0000000000000001 RDI: ffffea0007b489c0 [ 388.073607] RBP: ffffc90004d7fd58 R08: 0000000000000003 R09: ffffea0007b489dc [ 388.074619] R10: 0000000000000000 R11: 0052782ab317138d R12: 0000000000000018 [ 388.075625] R13: 0000000000000018 R14: ffff880211ceb000 R15: ffff880211ceb000 [ 388.076687] FS: 0000000000000000(0000) GS:ffff880214fc0000(0000) knlGS:0000000000000000 [ 388.083277] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 388.084536] CR2: 0000000000e18c60 CR3: 00000001ecf2e000 CR4: 00000000001406e0 [ 388.085748] Call Trace: [ 388.086690] ? find_next_bit+0xb/0x10 [ 388.088091] f2fs_gc+0x1a8/0x9d0 [f2fs] [ 388.088888] ? lock_timer_base+0x7d/0xa0 [ 388.090213] ? try_to_del_timer_sync+0x44/0x60 [ 388.091698] gc_thread_func+0x342/0x4b0 [f2fs] [ 388.092892] ? wait_woken+0x80/0x80 [ 388.094098] kthread+0x109/0x140 [ 388.095010] ? f2fs_gc+0x9d0/0x9d0 [f2fs] [ 388.096043] ? 
kthread_park+0x60/0x60 [ 388.097281] ret_from_fork+0x25/0x30 [ 388.098401] Code: ff ff 48 83 e8 01 48 89 44 24 58 e9 27 f8 ff ff 48 83 e8 01 e9 78 fc ff ff 48 8d 78 ff e9 17 fb ff ff 48 83 ef 01 e9 4d f4 ff ff <0f> 0b 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 [ 388.100864] RIP: do_garbage_collect+0xcc8/0xcd0 [f2fs] RSP: ffffc90004d7fc68 [ 388.101810] ---[ end trace 81c73d6e6b7da61d ]--- Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 772ef64d2035..78b256c71d1f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -986,7 +986,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, goto next; sum = page_address(sum_page); - f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + if (type != GET_SUM_TYPE((&sum->footer))) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) " + "type [%d, %d] in SSA and SIT", + segno, type, GET_SUM_TYPE((&sum->footer))); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto next; + } /* * this is to avoid deadlock: From af9f4e6d590d83c8bbb6a4c88bf7ef2d1c802322 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Aug 2018 19:13:44 +0800 Subject: [PATCH 0922/1212] f2fs: fix to do sanity check with block address in main area This patch add to do sanity check with below field: - cp_pack_total_block_count - blkaddr of data/node - extent info - Overview BUG() in verify_block_addr() when writing to a corrupted f2fs image - Reproduce (4.18 upstream kernel) - POC (poc.c) static void activity(char *mpoint) { char *foo_bar_baz; int err; static int buf[8192]; memset(buf, 0, sizeof(buf)); err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777); if (fd >= 0) { write(fd, (char *)buf, sizeof(buf)); fdatasync(fd); close(fd); } } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - Kernel message [ 689.349473] F2FS-fs (loop0): Mounted with checkpoint 
version = 3 [ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240 [ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4 [ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240 [ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d [ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202 [ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113 [ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540 [ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8 [ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540 [ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450 [ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0 [ 699.729171] Call Trace: [ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00 [ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0 [ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280 [ 699.729269] ? 
__radix_tree_replace+0xa3/0x120 [ 699.729276] __write_data_page+0x5c7/0xe30 [ 699.729291] ? kasan_check_read+0x11/0x20 [ 699.729310] ? page_mapped+0x8a/0x110 [ 699.729321] ? page_mkclean+0xe9/0x160 [ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00 [ 699.729331] ? invalid_page_referenced_vma+0x130/0x130 [ 699.729345] ? clear_page_dirty_for_io+0x332/0x450 [ 699.729351] f2fs_write_cache_pages+0x4ca/0x860 [ 699.729358] ? __write_data_page+0xe30/0xe30 [ 699.729374] ? percpu_counter_add_batch+0x22/0xa0 [ 699.729380] ? kasan_check_write+0x14/0x20 [ 699.729391] ? _raw_spin_lock+0x17/0x40 [ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30 [ 699.729413] ? iov_iter_advance+0x113/0x640 [ 699.729418] ? f2fs_write_end+0x133/0x2e0 [ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640 [ 699.729428] f2fs_write_data_pages+0x329/0x520 [ 699.729433] ? generic_perform_write+0x250/0x320 [ 699.729438] ? f2fs_write_cache_pages+0x860/0x860 [ 699.729454] ? current_time+0x110/0x110 [ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370 [ 699.729464] do_writepages+0x37/0xb0 [ 699.729468] ? f2fs_write_cache_pages+0x860/0x860 [ 699.729472] ? do_writepages+0x37/0xb0 [ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0 [ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0 [ 699.729496] ? __vfs_write+0x2b2/0x410 [ 699.729501] file_write_and_wait_range+0x66/0xb0 [ 699.729506] f2fs_do_sync_file+0x1f9/0xd90 [ 699.729511] ? truncate_partial_data_page+0x290/0x290 [ 699.729521] ? __sb_end_write+0x30/0x50 [ 699.729526] ? vfs_write+0x20f/0x260 [ 699.729530] f2fs_sync_file+0x9a/0xb0 [ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90 [ 699.729548] vfs_fsync_range+0x68/0x100 [ 699.729554] ? 
__fget_light+0xc9/0xe0 [ 699.729558] do_fsync+0x3d/0x70 [ 699.729562] __x64_sys_fdatasync+0x24/0x30 [ 699.729585] do_syscall_64+0x78/0x170 [ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 699.729613] RIP: 0033:0x7f9bf930d800 [ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24 [ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b [ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800 [ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003 [ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000 [ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610 [ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000 [ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 699.729782] ------------[ cut here ]------------ [ 699.729785] kernel BUG at fs/f2fs/segment.h:654! 
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI [ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4 [ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730 [ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0 [ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283 [ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef [ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c [ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55 [ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940 [ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001 [ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0 [ 699.752874] Call Trace: [ 699.753386] ? f2fs_inplace_write_data+0x93/0x240 [ 699.754341] f2fs_inplace_write_data+0xd2/0x240 [ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00 [ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0 [ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280 [ 699.758209] ? __radix_tree_replace+0xa3/0x120 [ 699.759164] __write_data_page+0x5c7/0xe30 [ 699.760002] ? kasan_check_read+0x11/0x20 [ 699.760823] ? page_mapped+0x8a/0x110 [ 699.761573] ? page_mkclean+0xe9/0x160 [ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00 [ 699.763332] ? invalid_page_referenced_vma+0x130/0x130 [ 699.764374] ? clear_page_dirty_for_io+0x332/0x450 [ 699.765347] f2fs_write_cache_pages+0x4ca/0x860 [ 699.766276] ? __write_data_page+0xe30/0xe30 [ 699.767161] ? 
percpu_counter_add_batch+0x22/0xa0 [ 699.768112] ? kasan_check_write+0x14/0x20 [ 699.768951] ? _raw_spin_lock+0x17/0x40 [ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30 [ 699.770885] ? iov_iter_advance+0x113/0x640 [ 699.771743] ? f2fs_write_end+0x133/0x2e0 [ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640 [ 699.773680] f2fs_write_data_pages+0x329/0x520 [ 699.774603] ? generic_perform_write+0x250/0x320 [ 699.775544] ? f2fs_write_cache_pages+0x860/0x860 [ 699.776510] ? current_time+0x110/0x110 [ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370 [ 699.778279] do_writepages+0x37/0xb0 [ 699.779026] ? f2fs_write_cache_pages+0x860/0x860 [ 699.779978] ? do_writepages+0x37/0xb0 [ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0 [ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0 [ 699.782820] ? __vfs_write+0x2b2/0x410 [ 699.783597] file_write_and_wait_range+0x66/0xb0 [ 699.784540] f2fs_do_sync_file+0x1f9/0xd90 [ 699.785381] ? truncate_partial_data_page+0x290/0x290 [ 699.786415] ? __sb_end_write+0x30/0x50 [ 699.787204] ? vfs_write+0x20f/0x260 [ 699.787941] f2fs_sync_file+0x9a/0xb0 [ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90 [ 699.789572] vfs_fsync_range+0x68/0x100 [ 699.790360] ? 
__fget_light+0xc9/0xe0 [ 699.791128] do_fsync+0x3d/0x70 [ 699.791779] __x64_sys_fdatasync+0x24/0x30 [ 699.792614] do_syscall_64+0x78/0x170 [ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 699.794406] RIP: 0033:0x7f9bf930d800 [ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24 [ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b [ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800 [ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003 [ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000 [ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610 [ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000 [ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]--- [ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730 [ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0 [ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283 [ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef [ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: 
ffff8801e3e7a64c [ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55 [ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940 [ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001 [ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0 [ 699.835556] ================================================================== [ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0 [ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309 [ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4 [ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 699.843475] Call Trace: [ 699.843982] dump_stack+0x7b/0xb5 [ 699.844661] print_address_description+0x70/0x290 [ 699.845607] kasan_report+0x291/0x390 [ 699.846351] ? update_stack_state+0x38c/0x3e0 [ 699.853831] __asan_load8+0x54/0x90 [ 699.854569] update_stack_state+0x38c/0x3e0 [ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20 [ 699.856601] ? __save_stack_trace+0x5e/0x100 [ 699.857476] unwind_next_frame.part.5+0x18e/0x490 [ 699.858448] ? unwind_dump+0x290/0x290 [ 699.859217] ? clear_page_dirty_for_io+0x332/0x450 [ 699.860185] __unwind_start+0x106/0x190 [ 699.860974] __save_stack_trace+0x5e/0x100 [ 699.861808] ? __save_stack_trace+0x5e/0x100 [ 699.862691] ? unlink_anon_vmas+0xba/0x2c0 [ 699.863525] save_stack_trace+0x1f/0x30 [ 699.864312] save_stack+0x46/0xd0 [ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420 [ 699.865990] ? flush_tlb_mm_range+0x15e/0x220 [ 699.866889] ? kasan_check_write+0x14/0x20 [ 699.867724] ? __dec_node_state+0x92/0xb0 [ 699.868543] ? lock_page_memcg+0x85/0xf0 [ 699.869350] ? unlock_page_memcg+0x16/0x80 [ 699.870185] ? 
page_remove_rmap+0x198/0x520 [ 699.871048] ? mark_page_accessed+0x133/0x200 [ 699.871930] ? _cond_resched+0x1a/0x50 [ 699.872700] ? unmap_page_range+0xcd4/0xe50 [ 699.873551] ? rb_next+0x58/0x80 [ 699.874217] ? rb_next+0x58/0x80 [ 699.874895] __kasan_slab_free+0x13c/0x1a0 [ 699.875734] ? unlink_anon_vmas+0xba/0x2c0 [ 699.876563] kasan_slab_free+0xe/0x10 [ 699.877315] kmem_cache_free+0x89/0x1e0 [ 699.878095] unlink_anon_vmas+0xba/0x2c0 [ 699.878913] free_pgtables+0x101/0x1b0 [ 699.879677] exit_mmap+0x146/0x2a0 [ 699.880378] ? __ia32_sys_munmap+0x50/0x50 [ 699.881214] ? kasan_check_read+0x11/0x20 [ 699.882052] ? mm_update_next_owner+0x322/0x380 [ 699.882985] mmput+0x8b/0x1d0 [ 699.883602] do_exit+0x43a/0x1390 [ 699.884288] ? mm_update_next_owner+0x380/0x380 [ 699.885212] ? f2fs_sync_file+0x9a/0xb0 [ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90 [ 699.886877] ? vfs_fsync_range+0x68/0x100 [ 699.887694] ? __fget_light+0xc9/0xe0 [ 699.888442] ? do_fsync+0x3d/0x70 [ 699.889118] ? __x64_sys_fdatasync+0x24/0x30 [ 699.889996] rewind_stack_do_exit+0x17/0x20 [ 699.890860] RIP: 0033:0x7f9bf930d800 [ 699.891585] Code: Bad RIP value. 
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b [ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800 [ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003 [ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000 [ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610 [ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000 [ 699.901241] The buggy address belongs to the page: [ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 699.903811] flags: 0x2ffff0000000000() [ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000 [ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000 [ 699.907673] page dumped because: kasan: bad access detected [ 699.909108] Memory state around the buggy address: [ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00 [ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2 [ 699.914392] ^ [ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00 [ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00 [ 699.918634] ================================================================== - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644 Reported-by Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 22 +++++++++++++++++++--- fs/f2fs/data.c | 33 +++++++++++++++++++++++++++------ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/file.c | 12 ++++++++++++ fs/f2fs/inode.c | 17 +++++++++++++++++ fs/f2fs/node.c | 4 ++++ fs/f2fs/segment.h | 3 +-- 7 files changed, 83 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 
fef18c291511..ccbf1a7ec95a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -86,8 +86,10 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, fio.page = page; if (f2fs_submit_page_bio(&fio)) { - f2fs_put_page(page, 1); - goto repeat; + memset(page_address(page), 0, PAGE_SIZE); + f2fs_stop_checkpoint(sbi, false); + f2fs_bug_on(sbi, 1); + return page; } lock_page(page); @@ -143,8 +145,14 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case META_POR: case DATA_GENERIC: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || - blkaddr < MAIN_BLKADDR(sbi))) + blkaddr < MAIN_BLKADDR(sbi))) { + if (type == DATA_GENERIC) { + f2fs_msg(sbi->sb, KERN_WARNING, + "access invalid blkaddr:%u", blkaddr); + WARN_ON(1); + } return false; + } break; case META_GENERIC: if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || @@ -772,6 +780,14 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, &cp_page_1, version); if (err) goto invalid_cp1; + + if (le32_to_cpu(cp_block->cp_pack_total_block_count) > + sbi->blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid cp_pack_total_block_count:%u", + le32_to_cpu(cp_block->cp_pack_total_block_count)); + goto invalid_cp1; + } pre_version = *version; cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 067c09cc18b8..57ea0940f868 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -439,7 +439,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; - verify_block_addr(fio, fio->new_blkaddr); + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) + return -EFAULT; + trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); @@ -1043,6 +1046,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + err = -EFAULT; + goto sync_out; + } + if (!is_valid_data_blkaddr(sbi, blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { @@ -1497,6 +1506,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, SetPageUptodate(page); goto confused; } + + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC)) + goto set_error_page; } else { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) @@ -1697,11 +1710,13 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr)) { - ipu_force = true; - fio->need_lock = LOCK_DONE; - goto got_it; - } + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) + return -EFAULT; + + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; } /* Deadlock due to between page->lock and f2fs_lock_op */ @@ -1720,6 +1735,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } got_it: + if (__is_valid_data_blkaddr(fio->old_blkaddr) && + !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) { + err = -EFAULT; + goto out_writepage; + } /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a2ae88fc3567..9ed528a730c7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2754,6 +2754,9 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ + (!is_read_io(fio->op) || fio->is_meta)) + bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9301697c3e7b..9908f00fb671 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -423,6 +423,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + blkaddr, DATA_GENERIC)) { + f2fs_put_dnode(&dn); + goto fail; + } + if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -517,6 +524,11 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->data_blkaddr = NULL_ADDR; f2fs_set_data_blkaddr(dn); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + continue; + f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2d322eb18dca..5ef9711902df 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -236,6 +236,23 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) __func__, inode->i_ino); return false; } + + if (F2FS_I(inode)->extent_tree) { + struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC))) { + 
set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) extent info [%u, %u, %u] " + "is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + } return true; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 15960985ae27..912f37394577 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1401,6 +1401,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, return 0; } + if (__is_valid_data_blkaddr(ni.blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) + goto redirty_out; + if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= WRITE_FLUSH_FUA; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a7460da9af43..b5bd3287e104 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -645,8 +645,7 @@ static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { struct f2fs_sb_info *sbi = fio->sbi; - if (PAGE_TYPE_OF_BIO(fio->type) == META && - (!is_read_io(fio->op) || fio->is_meta)) + if (__is_meta_io(fio)) verify_blkaddr(sbi, blk_addr, META_GENERIC); else verify_blkaddr(sbi, blk_addr, DATA_GENERIC); From b30479ee9e2fef5769fec840555645371b5ae07f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Jul 2018 19:37:00 +0800 Subject: [PATCH 0923/1212] f2fs: fix to detect looped node chain correctly Below dmesg was printed when testing generic/388 of fstest: F2FS-fs (zram1): find_fsync_dnodes: detect looped node chain, blkaddr:526615, next:526616 F2FS-fs (zram1): Cannot recover all fsync data errno=-22 F2FS-fs (zram1): Mounted with checkpoint version = 22300d0e F2FS-fs (zram1): find_fsync_dnodes: detect looped node chain, blkaddr:526615, next:526616 F2FS-fs (zram1): Cannot recover all fsync data errno=-22 The reason is that we initialize free_blocks with free blocks of filesystem, so if filesystem is full, free_blocks can be zero, below condition will be true, so that, it will fail recovery. 
if (++loop_cnt >= free_blocks || blkaddr == next_blkaddr_of_node(page)) To fix this issue, initialize free_blocks with correct value which includes over-provision blocks. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 184b34be635b..501767451e2b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -241,8 +241,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct page *page = NULL; block_t blkaddr; unsigned int loop_cnt = 0; - unsigned int free_blocks = sbi->user_block_count - - valid_user_blocks(sbi); + unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - + valid_user_blocks(sbi); int err = 0; /* get node pages in the current segment */ From 58d8acc92028a6afd373452a0b63535ed6961ad0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Jul 2018 14:24:11 +0800 Subject: [PATCH 0924/1212] f2fs: enable real-time discard by default f2fs is focused on flash based storage, so let's enable real-time discard by default, if the user doesn't want to enable it, 'nodiscard' mount option should be used on mount. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2ae2cfcf06c8..9cd7c3d159db 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1368,12 +1368,12 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_has_blkzoned(sbi->sb)) { - set_opt_mode(sbi, F2FS_MOUNT_LFS); + if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev))) set_opt(sbi, DISCARD); - } else { + if (f2fs_sb_has_blkzoned(sbi->sb)) + set_opt_mode(sbi, F2FS_MOUNT_LFS); + else set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); - } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); From 648f2564d47ae7ad8daea47d26189274ca12ffbd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Jul 2018 20:50:57 -0700 Subject: [PATCH 0925/1212] f2fs: fix defined but not used build warnings Fix build warnings in f2fs when CONFIG_PROC_FS is not enabled by marking the unused functions as __maybe_unused. ../fs/f2fs/sysfs.c:519:12: warning: 'segment_info_seq_show' defined but not used [-Wunused-function] ../fs/f2fs/sysfs.c:546:12: warning: 'segment_bits_seq_show' defined but not used [-Wunused-function] ../fs/f2fs/sysfs.c:570:12: warning: 'iostat_info_seq_show' defined but not used [-Wunused-function] Signed-off-by: Randy Dunlap Cc: Jaegeuk Kim Cc: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 27ddf60e3362..47f24b922af0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -9,6 +9,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ +#include #include #include #include @@ -518,7 +519,8 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; -static int segment_info_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused segment_info_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -545,7 +547,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_bits_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -569,7 +572,8 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } -static int iostat_info_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); From c13d317593359a77518028380ef25572ca5b3e21 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:16:53 +0800 Subject: [PATCH 0926/1212] f2fs: detect bug_on in f2fs_wait_discard_bios Add bug_on to detect potential non-empty discard wait list. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c6ede354774..5d6a0ca899b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1468,6 +1468,8 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) /* just to make sure there is no pending discard commands */ __wait_all_discard_cmd(sbi, NULL); + + f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); return dropped; } From a992c9d815499485230762b3ffc4650739fc2573 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:16:54 +0800 Subject: [PATCH 0927/1212] f2fs: clean up with IS_INODE() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5ef9711902df..07e68a306224 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -122,7 +122,7 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page if (!f2fs_sb_has_inode_chksum(sbi->sb)) return false; - if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), From 8e37f3a2aad381c4937388a47bb6942634a53de5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:08:09 +0800 Subject: [PATCH 0928/1212] f2fs: stop issuing discard immediately if there is queued IO For background discard policy, even if there is queued user IO, still we will check max_requests times for next discard entry, it is unneeded, let's just stop this round submission immediately. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5d6a0ca899b2..89f9471dbca5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1268,7 +1268,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0, issued = 0; + int i, issued = 0; bool io_interrupted = false; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { @@ -1289,20 +1289,19 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi)) { io_interrupted = true; - goto skip; + break; } __submit_discard_cmd(sbi, dpolicy, dc); - issued++; -skip: - if (++iter >= dpolicy->max_requests) + + if (++issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); - if (iter >= dpolicy->max_requests) + if (issued >= dpolicy->max_requests || io_interrupted) break; } From 7b1c7d07d000f7e0c90d0c9930c9e360d4f33878 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:11:01 +0800 Subject: [PATCH 0929/1212] f2fs: issue small discard by LBA order For small-granularity discards whose size is smaller than 64KB, if we issue those discards ordered by size, their IOs will be spread across the entire logical address space, so that in the FTL the L2P table will be updated randomly, resulting in a bad wear rate for the table. In this patch, we choose to issue small discards in LBA order; this way, we can expect that L2P table updates from adjacent discard IOs can be merged in the cache, which reduces lifetime wear of the flash. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9ed528a730c7..3cbd25644b30 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -359,6 +359,7 @@ struct discard_policy { unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ + bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ }; @@ -375,6 +376,7 @@ struct discard_cmd_control { unsigned int max_discards; /* max. discards to be issued */ unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ + unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 89f9471dbca5..cc7a78bbb422 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1015,6 +1015,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; + dpolicy->ordered = false; dpolicy->granularity = granularity; dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; @@ -1026,6 +1027,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = true; dpolicy->sync = false; + dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1261,6 +1263,63 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, + struct 
discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + unsigned int pos = dcc->next_pos; + unsigned int issued = 0; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, pos, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc) { + struct rb_node *node; + + if (dc->state != D_PREP) + goto next; + + if (dpolicy->io_aware && !is_idle(sbi)) { + io_interrupted = true; + break; + } + + dcc->next_pos = dc->lstart + dc->len; + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) + break; +next: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + blk_finish_plug(&plug); + + if (!dc) + dcc->next_pos = 0; + + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { @@ -1274,6 +1333,10 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dpolicy->granularity) break; + + if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + return __issue_discard_cmd_orderly(sbi, dpolicy); + pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); @@ -1834,6 +1897,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; + dcc->next_pos = 0; dcc->root = RB_ROOT; dcc->rbtree_check = false; From 5e39f1496f8eb41f19cef9be713c5abef787153b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 9 Jul 2018 20:32:42 
-0700 Subject: [PATCH 0930/1212] f2fs: Keep alloc_valid_block_count in sync If we attempt to request more blocks than we have room for, we try to instead request as much as we can, however, alloc_valid_block_count is not decremented to match the new value, allowing it to drift higher until the next checkpoint. This always decrements it when the requested amount cannot be fulfilled. Signed-off-by: Daniel Rosenberg Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3cbd25644b30..4c69b7f2194b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1751,18 +1751,20 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->total_valid_block_count -= diff; if (!*count) { spin_unlock(&sbi->stat_lock); - percpu_counter_sub(&sbi->alloc_valid_block_count, diff); goto enospc; } } spin_unlock(&sbi->stat_lock); - if (unlikely(release)) + if (unlikely(release)) { + percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); + } f2fs_i_blocks_write(inode, *count, true, true); return 0; enospc: + percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); return -ENOSPC; } From f0416587766350a3338ee477a467366682a5cb0d Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 12 Jul 2018 23:09:26 +0800 Subject: [PATCH 0931/1212] f2fs: do not set free of current section For the case when sbi->segs_per_sec > 1, take section:segment = 5 for example, if segment 1 is just used and allocate new segment 2, and the blocks of segment 1 is invalidated, at this time, the previous code will use __set_test_and_free to free the free_secmap and free_sections++, this is not correct since it is still a current section, so fix it. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b5bd3287e104..50495515f0a0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -448,6 +448,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; + if (IS_CURSEC(sbi, secno)) + goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { @@ -455,6 +457,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, free_i->free_sections++; } } +skip_free: spin_unlock(&free_i->segmap_lock); } From a6eb50d252c4b301705c3352671dd28b854eef91 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 12 Jul 2018 23:09:28 +0800 Subject: [PATCH 0932/1212] f2fs: blk_finish_plug of submit_bio in lfs mode Expand the blk_finish_plug action from blkzoned to normal lfs mode, since plug will cause the out-of-order IO submission, which is not friendly to flash in lfs mode. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 57ea0940f868..4ac63e77e474 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -262,7 +262,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug) + if (test_opt(sbi, LFS) && current->plug) blk_finish_plug(current->plug); start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; From fab2fbb28099ff7ea5ffaab1e6c239f5987b752b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:16:55 +0800 Subject: [PATCH 0933/1212] f2fs: fix to do sanity check with i_extra_isize If inode.i_extra_isize was fuzzed to an abnormal value, when calculating inline data size, the result will overflow, result in accessing invalid memory area when operating inline data. Let's do sanity check with i_extra_isize during inode loading for fixing. 
https://bugzilla.kernel.org/show_bug.cgi?id=200421 - Reproduce - POC (poc.c) #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void activity(char *mpoint) { char *foo_bar_baz; char *foo_baz; char *xattr; int err; err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); err = asprintf(&foo_baz, "%s/foo/baz", mpoint); err = asprintf(&xattr, "%s/foo/bar/xattr", mpoint); rename(foo_bar_baz, foo_baz); char buf2[113]; memset(buf2, 0, sizeof(buf2)); listxattr(xattr, buf2, sizeof(buf2)); removexattr(xattr, "user.mime_type"); } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - Kernel message Umount the image will leave the following message [ 2910.995489] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 2918.416465] ================================================================== [ 2918.416807] BUG: KASAN: slab-out-of-bounds in f2fs_iget+0xcb9/0x1a80 [ 2918.417009] Read of size 4 at addr ffff88018efc2068 by task a.out/1229 [ 2918.417311] CPU: 1 PID: 1229 Comm: a.out Not tainted 4.17.0+ #1 [ 2918.417314] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 2918.417323] Call Trace: [ 2918.417366] dump_stack+0x71/0xab [ 2918.417401] print_address_description+0x6b/0x290 [ 2918.417407] kasan_report+0x28e/0x390 [ 2918.417411] ? f2fs_iget+0xcb9/0x1a80 [ 2918.417415] f2fs_iget+0xcb9/0x1a80 [ 2918.417422] ? f2fs_lookup+0x2e7/0x580 [ 2918.417425] f2fs_lookup+0x2e7/0x580 [ 2918.417433] ? __recover_dot_dentries+0x400/0x400 [ 2918.417447] ? legitimize_path.isra.29+0x5a/0xa0 [ 2918.417453] __lookup_slow+0x11c/0x220 [ 2918.417457] ? may_delete+0x2a0/0x2a0 [ 2918.417475] ? deref_stack_reg+0xe0/0xe0 [ 2918.417479] ? __lookup_hash+0xb0/0xb0 [ 2918.417483] lookup_slow+0x3e/0x60 [ 2918.417488] walk_component+0x3ac/0x990 [ 2918.417492] ? generic_permission+0x51/0x1e0 [ 2918.417495] ? 
inode_permission+0x51/0x1d0 [ 2918.417499] ? pick_link+0x3e0/0x3e0 [ 2918.417502] ? link_path_walk+0x4b1/0x770 [ 2918.417513] ? _raw_spin_lock_irqsave+0x25/0x50 [ 2918.417518] ? walk_component+0x990/0x990 [ 2918.417522] ? path_init+0x2e6/0x580 [ 2918.417526] path_lookupat+0x13f/0x430 [ 2918.417531] ? trailing_symlink+0x3a0/0x3a0 [ 2918.417534] ? do_renameat2+0x270/0x7b0 [ 2918.417538] ? __kasan_slab_free+0x14c/0x190 [ 2918.417541] ? do_renameat2+0x270/0x7b0 [ 2918.417553] ? kmem_cache_free+0x85/0x1e0 [ 2918.417558] ? do_renameat2+0x270/0x7b0 [ 2918.417563] filename_lookup+0x13c/0x280 [ 2918.417567] ? filename_parentat+0x2b0/0x2b0 [ 2918.417572] ? kasan_unpoison_shadow+0x31/0x40 [ 2918.417575] ? kasan_kmalloc+0xa6/0xd0 [ 2918.417593] ? strncpy_from_user+0xaa/0x1c0 [ 2918.417598] ? getname_flags+0x101/0x2b0 [ 2918.417614] ? path_listxattr+0x87/0x110 [ 2918.417619] path_listxattr+0x87/0x110 [ 2918.417623] ? listxattr+0xc0/0xc0 [ 2918.417637] ? mm_fault_error+0x1b0/0x1b0 [ 2918.417654] do_syscall_64+0x73/0x160 [ 2918.417660] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2918.417676] RIP: 0033:0x7f2f3a3480d7 [ 2918.417677] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 2918.417732] RSP: 002b:00007fff4095b7d8 EFLAGS: 00000206 ORIG_RAX: 00000000000000c2 [ 2918.417744] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2f3a3480d7 [ 2918.417746] RDX: 0000000000000071 RSI: 00007fff4095b810 RDI: 000000000126a0c0 [ 2918.417749] RBP: 00007fff4095b890 R08: 000000000126a010 R09: 0000000000000000 [ 2918.417751] R10: 00000000000001ab R11: 0000000000000206 R12: 00000000004005e0 [ 2918.417753] R13: 00007fff4095b990 R14: 0000000000000000 R15: 0000000000000000 [ 2918.417853] Allocated by task 329: [ 2918.418002] kasan_kmalloc+0xa6/0xd0 [ 2918.418007] kmem_cache_alloc+0xc8/0x1e0 [ 2918.418023] mempool_init_node+0x194/0x230 [ 
2918.418027] mempool_init+0x12/0x20 [ 2918.418042] bioset_init+0x2bd/0x380 [ 2918.418052] blk_alloc_queue_node+0xe9/0x540 [ 2918.418075] dm_create+0x2c0/0x800 [ 2918.418080] dev_create+0xd2/0x530 [ 2918.418083] ctl_ioctl+0x2a3/0x5b0 [ 2918.418087] dm_ctl_ioctl+0xa/0x10 [ 2918.418092] do_vfs_ioctl+0x13e/0x8c0 [ 2918.418095] ksys_ioctl+0x66/0x70 [ 2918.418098] __x64_sys_ioctl+0x3d/0x50 [ 2918.418102] do_syscall_64+0x73/0x160 [ 2918.418106] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2918.418204] Freed by task 0: [ 2918.418301] (stack is not available) [ 2918.418521] The buggy address belongs to the object at ffff88018efc0000 which belongs to the cache biovec-max of size 8192 [ 2918.418894] The buggy address is located 104 bytes to the right of 8192-byte region [ffff88018efc0000, ffff88018efc2000) [ 2918.419257] The buggy address belongs to the page: [ 2918.419431] page:ffffea00063bf000 count:1 mapcount:0 mapping:ffff8801f2242540 index:0x0 compound_mapcount: 0 [ 2918.419702] flags: 0x17fff8000008100(slab|head) [ 2918.419879] raw: 017fff8000008100 dead000000000100 dead000000000200 ffff8801f2242540 [ 2918.420101] raw: 0000000000000000 0000000000030003 00000001ffffffff 0000000000000000 [ 2918.420322] page dumped because: kasan: bad access detected [ 2918.420599] Memory state around the buggy address: [ 2918.420764] ffff88018efc1f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2918.420975] ffff88018efc1f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2918.421194] >ffff88018efc2000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 2918.421406] ^ [ 2918.421627] ffff88018efc2080: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 2918.421838] ffff88018efc2100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2918.422046] ================================================================== [ 2918.422264] Disabling lock debugging due to kernel taint [ 2923.901641] BUG: unable to handle kernel paging request at ffff88018f0db000 [ 2923.901884] PGD 22226a067 P4D 
22226a067 PUD 222273067 PMD 18e642063 PTE 800000018f0db061 [ 2923.902120] Oops: 0003 [#1] SMP KASAN PTI [ 2923.902274] CPU: 1 PID: 1231 Comm: umount Tainted: G B 4.17.0+ #1 [ 2923.902490] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 2923.902761] RIP: 0010:__memset+0x24/0x30 [ 2923.902906] Code: 90 90 90 90 90 90 66 66 90 66 90 49 89 f9 48 89 d1 83 e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 f3 [ 2923.903446] RSP: 0018:ffff88018ddf7ae0 EFLAGS: 00010206 [ 2923.903622] RAX: 0000000000000000 RBX: ffff8801d549d888 RCX: 1ffffffffffdaffb [ 2923.903833] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88018f0daffc [ 2923.904062] RBP: ffff88018efc206c R08: 1ffff10031df840d R09: ffff88018efc206c [ 2923.904273] R10: ffffffffffffe1ee R11: ffffed0031df65fa R12: 0000000000000000 [ 2923.904485] R13: ffff8801d549dc98 R14: 00000000ffffc3db R15: ffffea00063bec80 [ 2923.904693] FS: 00007fa8b2f8a840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 2923.904937] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2923.910080] CR2: ffff88018f0db000 CR3: 000000018f892000 CR4: 00000000000006e0 [ 2923.914930] Call Trace: [ 2923.919724] f2fs_truncate_inline_inode+0x114/0x170 [ 2923.924487] f2fs_truncate_blocks+0x11b/0x7c0 [ 2923.929178] ? f2fs_truncate_data_blocks+0x10/0x10 [ 2923.933834] ? dqget+0x670/0x670 [ 2923.938437] ? f2fs_destroy_extent_tree+0xd6/0x270 [ 2923.943107] ? __radix_tree_lookup+0x2f/0x150 [ 2923.947772] f2fs_truncate+0xd4/0x1a0 [ 2923.952491] f2fs_evict_inode+0x5ab/0x610 [ 2923.957204] evict+0x15f/0x280 [ 2923.961898] __dentry_kill+0x161/0x250 [ 2923.966634] shrink_dentry_list+0xf3/0x250 [ 2923.971897] shrink_dcache_parent+0xa9/0x100 [ 2923.976561] ? shrink_dcache_sb+0x1f0/0x1f0 [ 2923.981177] ? wait_for_completion+0x8a/0x210 [ 2923.985781] ? 
migrate_swap_stop+0x2d0/0x2d0 [ 2923.990332] do_one_tree+0xe/0x40 [ 2923.994735] shrink_dcache_for_umount+0x3a/0xa0 [ 2923.999077] generic_shutdown_super+0x3e/0x1c0 [ 2924.003350] kill_block_super+0x4b/0x70 [ 2924.007619] deactivate_locked_super+0x65/0x90 [ 2924.011812] cleanup_mnt+0x5c/0xa0 [ 2924.015995] task_work_run+0xce/0xf0 [ 2924.020174] exit_to_usermode_loop+0x115/0x120 [ 2924.024293] do_syscall_64+0x12f/0x160 [ 2924.028479] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2924.032709] RIP: 0033:0x7fa8b2868487 [ 2924.036888] Code: 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 c9 2b 00 f7 d8 64 89 01 48 [ 2924.045750] RSP: 002b:00007ffc39824d58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 2924.050190] RAX: 0000000000000000 RBX: 00000000008ea030 RCX: 00007fa8b2868487 [ 2924.054604] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 00000000008f4360 [ 2924.058940] RBP: 00000000008f4360 R08: 0000000000000000 R09: 0000000000000014 [ 2924.063186] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007fa8b2d7183c [ 2924.067418] R13: 0000000000000000 R14: 00000000008ea210 R15: 00007ffc39824fe0 [ 2924.071534] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer joydev input_leds serio_raw snd soundcore mac_hid i2c_piix4 ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear 8139too qxl ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel psmouse aes_x86_64 8139cp crypto_simd cryptd mii glue_helper pata_acpi floppy [ 2924.098044] CR2: ffff88018f0db000 [ 2924.102520] ---[ end trace a8e0d899985faf31 ]--- [ 
2924.107012] RIP: 0010:__memset+0x24/0x30 [ 2924.111448] Code: 90 90 90 90 90 90 66 66 90 66 90 49 89 f9 48 89 d1 83 e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 f3 [ 2924.120724] RSP: 0018:ffff88018ddf7ae0 EFLAGS: 00010206 [ 2924.125312] RAX: 0000000000000000 RBX: ffff8801d549d888 RCX: 1ffffffffffdaffb [ 2924.129931] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88018f0daffc [ 2924.134537] RBP: ffff88018efc206c R08: 1ffff10031df840d R09: ffff88018efc206c [ 2924.139175] R10: ffffffffffffe1ee R11: ffffed0031df65fa R12: 0000000000000000 [ 2924.143825] R13: ffff8801d549dc98 R14: 00000000ffffc3db R15: ffffea00063bec80 [ 2924.148500] FS: 00007fa8b2f8a840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 2924.153247] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2924.158003] CR2: ffff88018f0db000 CR3: 000000018f892000 CR4: 00000000000006e0 [ 2924.164641] BUG: Bad rss-counter state mm:00000000fa04621e idx:0 val:4 [ 2924.170007] BUG: Bad rss-counter state mm:00000000fa04621e idx:1 val:2 - Location https://elixir.bootlin.com/linux/v4.18-rc3/source/fs/f2fs/inline.c#L78 memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); Here the length can be negative. 
Reported-by Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 07e68a306224..ea6787f8eff8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -196,6 +196,7 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); unsigned long long iblocks; iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); @@ -237,6 +238,17 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, " + "max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; @@ -305,11 +317,6 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); - return -EINVAL; - } - fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
le16_to_cpu(ri->i_extra_isize) : 0; @@ -329,6 +336,11 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + return -EINVAL; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); From 7fcfa4fe9b47eea0b32e4e7cc8f8a2758905f10d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 00:02:17 +0800 Subject: [PATCH 0934/1212] f2fs: fix to propagate error from __get_meta_page() If caller of __get_meta_page() can handle error, let's propagate error from __get_meta_page(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 55 ++++++++++++++++++++------- fs/f2fs/data.c | 24 ++++++++++-- fs/f2fs/f2fs.h | 8 +++- fs/f2fs/file.c | 7 +++- fs/f2fs/gc.c | 16 ++++++-- fs/f2fs/inline.c | 14 ++++++- fs/f2fs/inode.c | 12 +++++- fs/f2fs/node.c | 90 ++++++++++++++++++++++++++++++++++---------- fs/f2fs/recovery.c | 13 ++++++- fs/f2fs/segment.c | 37 +++++++++++++----- 10 files changed, 220 insertions(+), 56 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ccbf1a7ec95a..6a6cfa8446d6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -71,6 +71,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .encrypted_page = NULL, .is_meta = is_meta, }; + int err; if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; @@ -85,11 +86,10 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, fio.page = page; - if (f2fs_submit_page_bio(&fio)) { - memset(page_address(page), 0, PAGE_SIZE); - f2fs_stop_checkpoint(sbi, false); - f2fs_bug_on(sbi, 1); - return page; + err = f2fs_submit_page_bio(&fio); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); } lock_page(page); @@ -98,14 +98,9 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, goto repeat; } - /* - * if there is 
any IO error when accessing device, make our filesystem - * readonly and make sure do not write checkpoint with non-uptodate - * meta page. - */ if (unlikely(!PageUptodate(page))) { - memset(page_address(page), 0, PAGE_SIZE); - f2fs_stop_checkpoint(sbi, false); + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); } out: return page; @@ -116,6 +111,25 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, true); } +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + int count = 0; + +retry: + page = __get_meta_page(sbi, index, true); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EIO && + ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; + + f2fs_stop_checkpoint(sbi, false); + f2fs_bug_on(sbi, 1); + } + + return page; +} + /* for POR only */ struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { @@ -608,7 +622,9 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni); + if (err) + goto err_out; /* ENOMEM was fully retried in f2fs_evict_inode. 
*/ if (ni.blk_addr != NULL_ADDR) { @@ -656,9 +672,15 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { - struct page *page = f2fs_get_meta_page(sbi, start_blk + i); + struct page *page; struct f2fs_orphan_block *orphan_blk; + page = f2fs_get_meta_page(sbi, start_blk + i); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); @@ -749,6 +771,9 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, __u32 crc = 0; *cp_page = f2fs_get_meta_page(sbi, cp_addr); + if (IS_ERR(*cp_page)) + return PTR_ERR(*cp_page); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); @@ -874,6 +899,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned char *ckpt = (unsigned char *)sbi->ckpt; cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); + if (IS_ERR(cur_page)) + goto free_fail_no_cp; sit_bitmap_ptr = page_address(cur_page); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); f2fs_put_page(cur_page, 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4ac63e77e474..60f8a92c117b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -877,6 +877,10 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; + dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) @@ -886,7 +890,6 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; alloc: - f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 
f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, @@ -1289,7 +1292,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + @@ -1316,7 +1323,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } phys = (__u64)blk_to_logical(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; @@ -1702,6 +1713,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct inode *inode = page->mapping->host; struct dnode_of_data dn; struct extent_info ei = {0,0,0}; + struct node_info ni; bool ipu_force = false; int err = 0; @@ -1770,6 +1782,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) fio->need_lock = LOCK_REQ; } + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + if (err) + goto out_writepage; + + fio->version = ni.version; + err = encrypt_one_page(fio); if (err) goto out_writepage; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4c69b7f2194b..027076a7285b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -578,6 +578,8 @@ enum { */ }; +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -1085,6 +1087,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ + unsigned char version; /* version of the node */ }; #define is_read_io(rw) ((rw) == READ) @@ -2908,7 +2911,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); int 
f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); -void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); @@ -2935,7 +2938,7 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); @@ -3013,6 +3016,7 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9908f00fb671..407854794590 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1074,7 +1074,12 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni); + if (ret) { + 
f2fs_put_dnode(&dn); + return ret; + } + ilen = min((pgoff_t) ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 78b256c71d1f..c0a949d83de7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -517,7 +517,11 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, continue; } - f2fs_get_node_info(sbi, nid, &ni); + if (f2fs_get_node_info(sbi, nid, &ni)) { + f2fs_put_page(node_page, 1); + continue; + } + if (ni.blk_addr != start_addr + off) { f2fs_put_page(node_page, 1); continue; @@ -576,7 +580,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - f2fs_get_node_info(sbi, nid, dni); + if (f2fs_get_node_info(sbi, nid, dni)) { + f2fs_put_page(node_page, 1); + return false; + } if (sum->version != dni->version) { f2fs_msg(sbi->sb, KERN_WARNING, @@ -655,7 +662,10 @@ static void move_data_block(struct inode *inode, block_t bidx, */ f2fs_wait_on_page_writeback(page, DATA, true); - f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + if (err) + goto put_out; + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 17125baa2ba7..a628c747e693 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -121,6 +121,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .encrypted_page = NULL, .io_type = FS_DATA_IO, }; + struct node_info ni; int dirty, err; if (!f2fs_exist_data(dn->inode)) @@ -130,6 +131,14 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + if (err) { + f2fs_put_dnode(dn); + return err; + } + + fio.version = ni.version; + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { f2fs_put_dnode(dn); set_sbi_flag(fio.sbi, SBI_NEED_FSCK); @@ -690,7 +699,10 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; 
ilen -= start; - f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + if (err) + goto out; + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ea6787f8eff8..480351d836f4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -699,6 +699,7 @@ void f2fs_handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + int err; /* * clear nlink of inode in order to release resource of inode @@ -721,10 +722,16 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "May loss orphan inode, run fsck to fix."); + goto out; + } if (ni.blk_addr != NULL_ADDR) { - int err = f2fs_acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, @@ -737,6 +744,7 @@ void f2fs_handle_failed_inode(struct inode *inode) set_inode_flag(inode, FI_FREE_NID); } +out: f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 912f37394577..cfa4ceaeab65 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -113,7 +113,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { pgoff_t index = current_nat_addr(sbi, nid); - return f2fs_get_meta_page(sbi, index); + return f2fs_get_meta_page_nofail(sbi, index); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -419,7 +419,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info 
*sbi, int nr_shrink) /* * This function always returns success */ -void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -443,7 +443,7 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); up_read(&nm_i->nat_tree_lock); - return; + return 0; } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); @@ -466,6 +466,9 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, up_read(&nm_i->nat_tree_lock); page = f2fs_get_meta_page(sbi, index); + if (IS_ERR(page)) + return PTR_ERR(page); + nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); @@ -473,6 +476,7 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, cache: /* cache nat entry */ cache_nat_entry(sbi, nid, &ne); + return 0; } /* @@ -722,12 +726,15 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) return err; } -static void truncate_node(struct dnode_of_data *dn) +static int truncate_node(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; + int err; - f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; /* Deallocate node address */ f2fs_invalidate_blocks(sbi, ni.blk_addr); @@ -750,11 +757,14 @@ static void truncate_node(struct dnode_of_data *dn) dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); + + return 0; } static int truncate_dnode(struct dnode_of_data *dn) { struct page *page; + int err; if (dn->nid == 0) return 1; @@ -770,7 +780,10 @@ static int truncate_dnode(struct dnode_of_data *dn) dn->node_page = page; dn->ofs_in_node = 0; f2fs_truncate_data_blocks(dn); - truncate_node(dn); + err = truncate_node(dn); + if (err) + return err; + return 1; } @@ 
-835,7 +848,9 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (!ofs) { /* remove current indirect node */ dn->node_page = page; - truncate_node(dn); + ret = truncate_node(dn); + if (ret) + goto out_err; freed++; } else { f2fs_put_page(page, 1); @@ -893,7 +908,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; - truncate_node(dn); + err = truncate_node(dn); + if (err) + goto fail; } else { f2fs_put_page(pages[idx], 1); } @@ -1014,6 +1031,7 @@ int f2fs_truncate_xattr_node(struct inode *inode) nid_t nid = F2FS_I(inode)->i_xattr_nid; struct dnode_of_data dn; struct page *npage; + int err; if (!nid) return 0; @@ -1022,10 +1040,15 @@ int f2fs_truncate_xattr_node(struct inode *inode) if (IS_ERR(npage)) return PTR_ERR(npage); + set_new_dnode(&dn, inode, NULL, npage, nid); + err = truncate_node(&dn); + if (err) { + f2fs_put_page(npage, 1); + return err; + } + f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, NULL, npage, nid); - truncate_node(&dn); return 0; } @@ -1059,7 +1082,11 @@ int f2fs_remove_inode_page(struct inode *inode) inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ - truncate_node(&dn); + err = truncate_node(&dn); + if (err) { + f2fs_put_dnode(&dn); + return err; + } return 0; } @@ -1092,7 +1119,11 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + if (err) { + dec_valid_node_count(sbi, dn->inode, !ofs); + goto fail; + } f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); #endif new_ni.nid = dn->nid; @@ -1140,6 +1171,7 @@ static int read_node_page(struct page *page, int op_flags) .page = page, .encrypted_page = NULL, }; + int err; if (PageUptodate(page)) { #ifdef CONFIG_F2FS_CHECK_FS @@ -1148,7 +1180,9 @@ static int read_node_page(struct page 
*page, int op_flags) return LOCKED_PAGE; } - f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni); + if (err) + return err; if (unlikely(ni.blk_addr == NULL_ADDR) || is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { @@ -1383,6 +1417,9 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); + if (f2fs_get_node_info(sbi, nid, &ni)) + goto redirty_out; + if (wbc->for_reclaim) { if (!down_read_trylock(&sbi->node_write)) goto redirty_out; @@ -1390,8 +1427,6 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, down_read(&sbi->node_write); } - f2fs_get_node_info(sbi, nid, &ni); - /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); @@ -2314,12 +2349,16 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) struct dnode_of_data dn; struct node_info ni; struct page *xpage; + int err; if (!prev_xnid) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni); + if (err) + return err; + f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); @@ -2354,8 +2393,11 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; + int err; - f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni); + if (err) + return err; if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; @@ -2409,7 +2451,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) return 0; } -void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2431,6 
+2473,9 @@ void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, for (idx = addr; idx < addr + nrpages; idx++) { struct page *page = f2fs_get_tmp_page(sbi, idx); + if (IS_ERR(page)) + return PTR_ERR(page); + rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; @@ -2442,6 +2487,7 @@ void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, invalidate_mapping_pages(META_MAPPING(sbi), addr, addr + nrpages); } + return 0; } static void remove_nats_in_journal(struct f2fs_sb_info *sbi) @@ -2678,7 +2724,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { - struct page *page = f2fs_get_meta_page(sbi, nat_bits_addr++); + struct page *page; + + page = f2fs_get_meta_page(sbi, nat_bits_addr++); + if (IS_ERR(page)) { + disable_nat_bits(sbi, true); + return PTR_ERR(page); + } memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 501767451e2b..8c4695865278 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -256,6 +256,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, return 0; page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } if (!is_recoverable_dnode(page)) break; @@ -471,7 +475,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) + goto err; + f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); @@ -574,6 +581,10 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, f2fs_ra_meta_pages_cond(sbi, blkaddr); page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = 
PTR_ERR(page); + break; + } if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc7a78bbb422..2637799f5c27 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,7 +250,13 @@ static int __revoke_inmem_pages(struct inode *inode, err = -EAGAIN; goto next; } - f2fs_get_node_info(sbi, dn.nid, &ni); + + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + if (cur->old_addr == NEW_ADDR) { f2fs_invalidate_blocks(sbi, dn.data_blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -2131,7 +2137,7 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); + return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, @@ -2992,11 +2998,9 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; - struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); - f2fs_get_node_info(sbi, dn->nid, &ni); - set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); @@ -3158,7 +3162,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static void read_compacted_summaries(struct f2fs_sb_info *sbi) +static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; @@ -3170,6 +3174,8 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) start = start_sum_block(sbi); page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); /* Step 1: restore nat cache */ @@ -3210,11 
+3216,14 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) page = NULL; page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); offset = 0; } } f2fs_put_page(page, 1); + return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) @@ -3226,6 +3235,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; + int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { @@ -3249,6 +3259,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) } new = f2fs_get_meta_page(sbi, blk_addr); + if (IS_ERR(new)) + return PTR_ERR(new); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { @@ -3260,7 +3272,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - f2fs_restore_node_summary(sbi, segno, sum); + err = f2fs_restore_node_summary(sbi, segno, sum); + if (err) + goto out; } } @@ -3280,8 +3294,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); +out: f2fs_put_page(new, 1); - return 0; + return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) @@ -3299,7 +3314,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) META_CP, true); /* restore for compacted data summary */ - read_compacted_summaries(sbi); + err = read_compacted_summaries(sbi); + if (err) + return err; type = CURSEG_HOT_NODE; } @@ -3430,7 +3447,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); } static 
struct page *get_next_sit_page(struct f2fs_sb_info *sbi, From ca5241674a6aa1f46d1ce3562554cb0446f7e4b4 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 19 Jul 2018 09:23:57 +0900 Subject: [PATCH 0935/1212] f2fs: avoid duplicated permission check for "trusted." xattrs Because xattr_permission already checks CAP_SYS_ADMIN capability, we don't need to check it. Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 61a5d9284bc0..34b7f691cf12 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -38,9 +38,6 @@ static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: @@ -69,9 +66,6 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: From 836f5b31414e18b1c5ef9c1553f061d43123c66c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:45 +0800 Subject: [PATCH 0936/1212] f2fs: kill EXT_TREE_VEC_SIZE Since commit 201ef5e080c9 ("f2fs: improve shrink performance of extent nodes"), there is no user of EXT_TREE_VEC_SIZE, just kill it for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 027076a7285b..cbbe917412a4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -584,9 +584,6 @@ enum { #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ -/* vector size for gang look-up from extent cache that consists of radix tree */ -#define EXT_TREE_VEC_SIZE 64 - /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ From 207f081d6a1b7705ecaf0cfeebbd8f9ea12bde0a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:46 +0800 Subject: [PATCH 0937/1212] f2fs: clean up with get_current_nat_page Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cfa4ceaeab65..074c576242d4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -112,25 +112,22 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - pgoff_t index = current_nat_addr(sbi, nid); - return f2fs_get_meta_page_nofail(sbi, index); + return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { struct page *src_page; struct page *dst_page; - pgoff_t src_off; pgoff_t dst_off; void *src_addr; void *dst_addr; struct f2fs_nm_info *nm_i = NM_I(sbi); - src_off = current_nat_addr(sbi, nid); - dst_off = next_nat_addr(sbi, src_off); + dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid)); /* get current nat block page with lock */ - src_page = f2fs_get_meta_page(sbi, src_off); + src_page = get_current_nat_page(sbi, nid); dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); From a4160d79ece6d82f72a129098d80d5f4af07ab9e Mon Sep 17 00:00:00 2001 From: 
Chao Yu Date: Tue, 17 Jul 2018 20:41:47 +0800 Subject: [PATCH 0938/1212] f2fs: clean up with f2fs_encrypted_inode() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 407854794590..3257104e0838 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1622,7 +1622,7 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags = fi->i_flags; - if (file_is_encrypt(inode)) + if (f2fs_encrypted_inode(inode)) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; From 29cad83d334723ad78b5ad3e1e6cf6ab594f0fa1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:48 +0800 Subject: [PATCH 0939/1212] f2fs: clean up with f2fs_is_{atomic,volatile}_file() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2637799f5c27..5591f727f36b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2800,8 +2800,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - is_inode_flag_set(inode, FI_ATOMIC_FILE) || - is_inode_flag_set(inode, FI_VOLATILE_FILE)) + f2fs_is_atomic_file(inode) || + f2fs_is_volatile_file(inode)) return CURSEG_HOT_DATA; /* f2fs_rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; From af6d504a3d3d6a18e1f4aedc86fc7bb9894161e4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:49 +0800 Subject: [PATCH 0940/1212] f2fs: clean up ioctl interface naming Remove redundant prefix 'f2fs_' in the middle of f2fs_ioc_f2fs_write_checkpoint(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3257104e0838..dd5f08943feb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2129,7 +2129,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; } -static int f2fs_ioc_f2fs_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2722,7 +2722,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_f2fs_write_checkpoint(filp, arg); + return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: From bc9e6f1a0f5037cbd6ed8402d24f7293e90834eb Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 19 Jul 2018 14:57:14 +0800 Subject: [PATCH 0941/1212] f2fs: fix wrong kernel message when recover fsync data on ro fs This patch fixes the wrong message info for recovering fsync data on readonly fs. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 8c4695865278..0a6e81879a1f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -639,7 +639,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) #endif if (s_flags & MS_RDONLY) { - f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + f2fs_msg(sbi->sb, KERN_INFO, + "recover fsync data on readonly fs"); sbi->sb->s_flags &= ~MS_RDONLY; } From a2ee1be2b06a82a248c3591f8c3ffd4fa1eee074 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 19 Jul 2018 23:57:54 +0800 Subject: [PATCH 0942/1212] f2fs: restrict setting up inode.i_advise In order to give advice to f2fs to recognize hot/cold file, it is possible that we can set specific bit in inode.i_advise through setxattr(), but there are several bits which are used internally, such as encrypt_bit, keep_size_bit, they should never be changed through setxattr(). So this patch 1) adds FADVISE_MODIFIABLE_BITS to filter the modifiable bits given by the user, 2) supports clearing {hot,cold}_file bits. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/xattr.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cbbe917412a4..17fa394449b6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -669,6 +669,8 @@ enum { #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 /* reserved */ +#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) + #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 34b7f691cf12..152078bb4829 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -136,6 +136,8 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, size_t size, int flags) { struct inode *inode = d_inode(dentry); + unsigned char old_advise = F2FS_I(inode)->i_advise; + unsigned char new_advise; if (strcmp(name, "") != 0) return -EINVAL; @@ -144,7 +146,14 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, if (value == NULL) return -EINVAL; - F2FS_I(inode)->i_advise |= *(char *)value; + new_advise = *(char *)value; + if (new_advise & ~FADVISE_MODIFIABLE_BITS) + return -EINVAL; + + new_advise = new_advise & FADVISE_MODIFIABLE_BITS; + new_advise |= old_advise & ~FADVISE_MODIFIABLE_BITS; + + F2FS_I(inode)->i_advise = new_advise; f2fs_mark_inode_dirty_sync(inode, true); return 0; } From 06da8b3d9ba5904933b889f4a78987675d02c19b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Jul 2018 18:15:11 +0900 Subject: [PATCH 0943/1212] f2fs: don't allow any writes on aborted atomic writes In order to prevent abusing atomic writes by abnormal users, we've added a threshold, 20% over memory footprint, which disallows further atomic writes. 
Previously, however, SQLite doesn't know the files became normal, so that it could write stale data and commit on revoked normal database file. Once f2fs detects such the abnormal behavior, this patch tries to avoid further writes in write_begin(). Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +++-- fs/f2fs/file.c | 7 ++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 60f8a92c117b..a9418f3f891a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2288,8 +2288,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); - if (f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) { + if ((f2fs_is_atomic_file(inode) && + !f2fs_available_free_memory(sbi, INMEM_PAGES)) || + is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { err = -ENOMEM; drop_atomic = true; goto fail; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dd5f08943feb..53100ebac81e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1708,8 +1708,11 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) + ret = -EINVAL; goto out; + } ret = f2fs_convert_inline_inode(inode); if (ret) @@ -1871,6 +1874,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + inode_unlock(inode); mnt_drop_write_file(filp); From 819d915b47b0c0933a925718398e45dcdd631ede Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 19 Jul 2018 20:58:15 +0800 Subject: [PATCH 0944/1212] f2fs: issue discard align to section in LFS mode For the case when sbi->segs_per_sec > 1 with lfs mode, take section:segment = 5 for example, if the section prefree_map is ...previous 
section | current section (1 1 0 1 1) | next section..., then the start = x, end = x + 1, after start = start_segno + sbi->segs_per_sec, start = x + 5, then it will skip x + 3 and x + 4, but their bitmap is still set, which will cause duplicated f2fs_issue_discard of this same section in the next write_checkpoint: round 1: section bitmap : 1 1 1 1 1, all valid, prefree_map: 0 0 0 0 0 then rm data block NO.2, block NO.2 becomes invalid, prefree_map: 0 0 1 0 0 write_checkpoint: section bitmap: 1 1 0 1 1, prefree_map: 0 0 0 0 0, prefree of NO.2 is cleared, and no discard issued round 2: rm data block NO.0, NO.1, NO.3, NO.4 all invalid, but prefree bit of NO.2 is set and cleared in round 1, then prefree_map: 1 1 0 1 1 write_checkpoint: section bitmap: 0 0 0 0 0, prefree_map: 0 0 0 1 1, no valid blocks of this section, so discard issued, but this time prefree bit of NO.3 and NO.4 is skipped due to start = start_segno + sbi->segs_per_sec; round 3: write_checkpoint: section bitmap: 0 0 0 0 0, prefree_map: 0 0 0 1 1 -> 0 0 0 0 0, no valid blocks of this section, so discard issued, this time prefree bit of NO.3 and NO.4 is cleared, but the discard of this section is sent again... To fix this problem, we can align the start and end value to section boundary for fstrim and real-time discard operation, and decide to issue discard only when the whole section is invalid, which can issue discard aligned to section size as much as possible and avoid redundant discard. 
Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5591f727f36b..5b79ba95d56e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1795,21 +1795,30 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; + + if (need_align && end != -1) + end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) break; end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); - for (i = start; i < end; i++) - clear_bit(i, prefree_map); + if (need_align) { + start = rounddown(start, sbi->segs_per_sec); + end = roundup(end, sbi->segs_per_sec); + } - dirty_i->nr_dirty[PRE] -= end - start; + for (i = start; i < end; i++) { + if (test_and_clear_bit(i, prefree_map)) + dirty_i->nr_dirty[PRE]--; + } if (!test_opt(sbi, DISCARD)) continue; @@ -2596,6 +2605,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -2613,6 +2623,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + if (need_align) { + start_segno = rounddown(start_segno, sbi->segs_per_sec); + end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1; + } cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); From b404b3cbb80afcdd5aaa3355c57b71a71a79359e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Jul 2018 19:16:21 +0800 Subject: [PATCH 0945/1212] f2fs: let checkpoint flush dnode page of regular Fsyncer will wait on all dnode pages of regular writeback before flushing, if there are async dnode pages blocked by IO scheduler, it may decrease fsync's performance. In this patch, we choose to let f2fs_balance_fs_bg() trigger checkpoint to flush these dnode pages of regular, so async IO of dnode page can be eliminated, making fsyncer only need to wait for sync IO. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 +++++++- fs/f2fs/node.h | 5 +++++ fs/f2fs/segment.c | 4 +++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 074c576242d4..23a841025d9d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1410,6 +1410,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; + if (wbc->sync_mode == WB_SYNC_NONE && + IS_DNODE(page) && is_cold_node(page)) + goto redirty_out; + /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); @@ -1727,10 +1731,12 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, } if (step < 2) { + if (wbc->sync_mode == WB_SYNC_NONE && step == 1) + goto out; step++; goto next_step; } - +out: if (nwritten) f2fs_submit_merged_write(sbi, NODE); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 8f34bdffde93..0f4db7a61254 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -135,6 +135,11 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) 
return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; } +static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) +{ + return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5b79ba95d56e..e067d4768360 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -509,7 +509,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi) && !excess_dirty_nats(sbi)) + if (!is_idle(sbi) && + (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -517,6 +518,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) !f2fs_available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || + excess_dirty_nodes(sbi) || f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; From bd293daff1f3ad54b7f3b24c3d91a2aaa4de3a45 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 23 Jul 2018 22:10:22 +0800 Subject: [PATCH 0946/1212] f2fs: add proc entry to show victim_secmap bitmap This patch adds a new proc entry to show victim_secmap information in more detail, which is very helpful to know the get_victim candidate status clearly, and helpful to debug problems (e.g., some sections can not gc all of its blocks, since some blocks belong to atomic file, leaving victim_secmap with section bit setting, in an extreme case, this will lead to all bytes of victim_secmap being set to 0xff). 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 47f24b922af0..79e47e7d737c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -615,6 +615,28 @@ static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + seq_puts(seq, "format: victim_secmap bitmaps\n"); + + for (i = 0; i < MAIN_SECS(sbi); i++) { + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0); + if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + return 0; +} + #define F2FS_PROC_FILE_DEF(_name) \ static int _name##_open_fs(struct inode *inode, struct file *file) \ { \ @@ -631,6 +653,7 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); F2FS_PROC_FILE_DEF(iostat_info); +F2FS_PROC_FILE_DEF(victim_bits); int __init f2fs_init_sysfs(void) { @@ -681,6 +704,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) &f2fs_seq_segment_bits_fops, sb); proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, &f2fs_seq_iostat_info_fops, sb); + proc_create_data("victim_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_victim_bits_fops, sb); } return 0; } @@ -691,6 +716,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); From 
3675f13c6283e8f5f3f8388c0f4ec7c64c6a5b8e Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 26 Jul 2018 19:24:25 +0800 Subject: [PATCH 0947/1212] f2fs: quota: fix incorrect comments Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- fs/f2fs/super.c | 5 +---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6a6cfa8446d6..fb9529a7a02e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -662,7 +662,10 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9cd7c3d159db..45f6858dfeb7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2963,10 +2963,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; #ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * filesystem has quota feature, so that they are updated correctly. - */ + /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) { From bab53a178a57cc3b5e59eb5ff73a8830cf125626 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 24 Jul 2018 20:17:52 +0800 Subject: [PATCH 0948/1212] f2fs: quota: do not mount as RDWR without QUOTA if quota feature enabled If quota feature is enabled, quota is on by default. However, if CONFIG_QUOTA is not built in kernel, dquot entries will not get updated, which leads to quota inconsistency. 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 45f6858dfeb7..4c2c891fef96 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -346,12 +346,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "QUOTA feature is enabled, so ignore jquota_fmt"); F2FS_OPTION(sbi).s_jquota_fmt = 0; } - if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { - f2fs_msg(sbi->sb, KERN_INFO, - "Filesystem with quota feature cannot be mounted RDWR " - "without CONFIG_QUOTA"); - return -1; - } return 0; } #endif @@ -774,6 +768,13 @@ static int parse_options(struct super_block *sb, char *options) #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; +#else + if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -EINVAL; + } #endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { From 1597eac8347009f4cbe833391b4fe89cadf250c8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Jul 2018 07:19:48 +0800 Subject: [PATCH 0949/1212] f2fs: fix to restrict mount condition when without CONFIG_QUOTA Like quota_ino feature, we need to reject mounting RDWR with image which enables project_quota feature when there is no CONFIG_QUOTA be set in kernel. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c2c891fef96..9d44f60da697 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -775,6 +775,12 @@ static int parse_options(struct super_block *sb, char *options) "without CONFIG_QUOTA"); return -EINVAL; } + if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be " + "mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } #endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { From ed523c069502dcd1084aa4fcfef62817c1c17f79 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 Jul 2018 18:15:13 +0800 Subject: [PATCH 0950/1212] f2fs: don't keep meta pages used for block migration For migration of encrypted inode's block, we load data of encrypted block into meta inode's page cache, after checkpoint, those all intermediate pages should be clean, and no one will read them again, so let's just release them for more memory. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fb9529a7a02e..901d696dded7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1409,6 +1409,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); + /* + * invalidate intermediate page cache borrowed from meta inode + * which are used for migration of encrypted inode's blocks. 
+ */ + if (f2fs_sb_has_encrypt(sbi->sb)) + invalidate_mapping_pages(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); + f2fs_release_ino_entry(sbi, false); clear_sbi_flag(sbi, SBI_IS_DIRTY); From 7b4d1de67b1d81016fb4759fb3582f05f67f2f4c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 Jul 2018 18:15:14 +0800 Subject: [PATCH 0951/1212] f2fs: fix to active page in lru list for read path If config CONFIG_F2FS_FAULT_INJECTION is on, for both read or write path we will call find_lock_page() to get the page, but for read path, it missed to passing FGP_ACCESSED to allocator to active the page in LRU list, result in being reclaimed in advance incorrectly, fix it. Reported-by: Xianrong Zhou Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 17fa394449b6..5323390f49b4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2029,8 +2029,13 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { #ifdef CONFIG_F2FS_FAULT_INJECTION - struct page *page = find_lock_page(mapping, index); + struct page *page; + if (!for_write) + page = find_get_page_flags(mapping, index, + FGP_LOCK | FGP_ACCESSED); + else + page = find_lock_page(mapping, index); if (page) return page; From 45cd1d9d075fc6c4d90d3c758f1fe60e9a3e10e0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 29 Jul 2018 12:16:59 +0800 Subject: [PATCH 0952/1212] f2fs: fix to clear PG_checked flag in set_page_dirty() PG_checked flag will be set on data page during GC, later, we can recognize such page by the flag and migrate page to cold segment. But previously, we don't clear this flag when invalidating data page, after page redirtying, we will write it into wrong log. Let's clear PG_checked flag in set_page_dirty() to avoid this. 
Signed-off-by: Weichao Guo Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a9418f3f891a..a05a4d2205ad 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2539,6 +2539,10 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); + /* don't remain PG_checked flag which was set during GC */ + if (is_cold_data(page)) + clear_cold_data(page); + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { f2fs_register_inmem_page(inode, page); From fc320c924c06a5ae587fbdcef85bcf6b8abadaa0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 31 Jul 2018 09:09:01 -0700 Subject: [PATCH 0953/1212] f2fs: avoid f2fs_bug_on() in cp_error case There is a subtle race condition to invoke f2fs_bug_on() in shutdown tests. I've confirmed that the last checkpoint is preserved in consistent state, so it'd be fine to just return error at this moment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 23a841025d9d..c5d230733285 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1075,6 +1075,10 @@ int f2fs_remove_inode_page(struct inode *inode) f2fs_truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + f2fs_put_dnode(&dn); + return -EIO; + } f2fs_bug_on(F2FS_I_SB(inode), inode->i_blocks != 0 && inode->i_blocks != 8); From 65db828dbd3127d54a8abcc5844b0f602121016f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Aug 2018 19:16:11 +0800 Subject: [PATCH 0954/1212] f2fs: fix to do sanity check with cp_pack_start_sum After fuzzing, cp_pack_start_sum could be corrupted, so current log's summary info should be wrong due to loading incorrect summary block. 
Then, if segment's type in current log is exceeded NR_CURSEG_TYPE, it can lead accessing invalid dirty_i->dirty_segmap bitmap finally. Add sanity check for cp_pack_start_sum to fix this issue. https://bugzilla.kernel.org/show_bug.cgi?id=200419 - Reproduce - Kernel message (f2fs-dev w/ KASAN) [ 3117.578432] F2FS-fs (loop0): Invalid log blocks per segment (8) [ 3117.578445] F2FS-fs (loop0): Can't find valid F2FS filesystem in 2th superblock [ 3117.581364] F2FS-fs (loop0): invalid crc_offset: 30716 [ 3117.583564] WARNING: CPU: 1 PID: 1225 at fs/f2fs/checkpoint.c:90 __get_meta_page+0x448/0x4b0 [ 3117.583570] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer joydev input_leds serio_raw snd soundcore mac_hid i2c_piix4 ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear 8139too qxl ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel psmouse aes_x86_64 8139cp crypto_simd cryptd mii glue_helper pata_acpi floppy [ 3117.584014] CPU: 1 PID: 1225 Comm: mount Not tainted 4.17.0+ #1 [ 3117.584017] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 3117.584022] RIP: 0010:__get_meta_page+0x448/0x4b0 [ 3117.584023] Code: 00 49 8d bc 24 84 00 00 00 e8 74 54 da ff 41 83 8c 24 84 00 00 00 08 4c 89 f6 4c 89 ef e8 c0 d9 95 00 48 89 ef e8 18 e3 00 00 <0f> 0b f0 80 4d 48 04 e9 0f fe ff ff 0f 0b 48 89 c7 48 89 04 24 e8 [ 3117.584072] RSP: 0018:ffff88018eb678c0 EFLAGS: 00010286 [ 3117.584082] RAX: ffff88018f0a6a78 RBX: ffffea0007a46600 RCX: ffffffff9314d1b2 [ 3117.584085] RDX: ffffffff00000001 RSI: 0000000000000000 RDI: ffff88018f0a6a98 [ 3117.584087] RBP: ffff88018ebe9980 R08: 0000000000000002 
R09: 0000000000000001 [ 3117.584090] R10: 0000000000000001 R11: ffffed00326e4450 R12: ffff880193722200 [ 3117.584092] R13: ffff88018ebe9afc R14: 0000000000000206 R15: ffff88018eb67900 [ 3117.584096] FS: 00007f5694636840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 3117.584098] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3117.584101] CR2: 00000000016f21b8 CR3: 0000000191c22000 CR4: 00000000000006e0 [ 3117.584112] Call Trace: [ 3117.584121] ? f2fs_set_meta_page_dirty+0x150/0x150 [ 3117.584127] ? f2fs_build_segment_manager+0xbf9/0x3190 [ 3117.584133] ? f2fs_npages_for_summary_flush+0x75/0x120 [ 3117.584145] f2fs_build_segment_manager+0xda8/0x3190 [ 3117.584151] ? f2fs_get_valid_checkpoint+0x298/0xa00 [ 3117.584156] ? f2fs_flush_sit_entries+0x10e0/0x10e0 [ 3117.584184] ? map_id_range_down+0x17c/0x1b0 [ 3117.584188] ? __put_user_ns+0x30/0x30 [ 3117.584206] ? find_next_bit+0x53/0x90 [ 3117.584237] ? cpumask_next+0x16/0x20 [ 3117.584249] f2fs_fill_super+0x1948/0x2b40 [ 3117.584258] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.584279] ? sget_userns+0x65e/0x690 [ 3117.584296] ? set_blocksize+0x88/0x130 [ 3117.584302] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.584305] mount_bdev+0x1c0/0x200 [ 3117.584310] mount_fs+0x5c/0x190 [ 3117.584320] vfs_kern_mount+0x64/0x190 [ 3117.584330] do_mount+0x2e4/0x1450 [ 3117.584343] ? lockref_put_return+0x130/0x130 [ 3117.584347] ? copy_mount_string+0x20/0x20 [ 3117.584357] ? kasan_unpoison_shadow+0x31/0x40 [ 3117.584362] ? kasan_kmalloc+0xa6/0xd0 [ 3117.584373] ? memcg_kmem_put_cache+0x16/0x90 [ 3117.584377] ? __kmalloc_track_caller+0x196/0x210 [ 3117.584383] ? _copy_from_user+0x61/0x90 [ 3117.584396] ? 
memdup_user+0x3e/0x60 [ 3117.584401] ksys_mount+0x7e/0xd0 [ 3117.584405] __x64_sys_mount+0x62/0x70 [ 3117.584427] do_syscall_64+0x73/0x160 [ 3117.584440] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.584455] RIP: 0033:0x7f5693f14b9a [ 3117.584456] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 3117.584505] RSP: 002b:00007fff27346488 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 3117.584510] RAX: ffffffffffffffda RBX: 00000000016e2030 RCX: 00007f5693f14b9a [ 3117.584512] RDX: 00000000016e2210 RSI: 00000000016e3f30 RDI: 00000000016ee040 [ 3117.584514] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 3117.584516] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 00000000016ee040 [ 3117.584519] R13: 00000000016e2210 R14: 0000000000000000 R15: 0000000000000003 [ 3117.584523] ---[ end trace a8e0d899985faf31 ]--- [ 3117.685663] F2FS-fs (loop0): f2fs_check_nid_range: out-of-range nid=2, run fsck to fix. [ 3117.685673] F2FS-fs (loop0): recover_data: ino = 2 (i_size: recover) recovered = 1, err = 0 [ 3117.685707] ================================================================== [ 3117.685955] BUG: KASAN: slab-out-of-bounds in __remove_dirty_segment+0xdd/0x1e0 [ 3117.686175] Read of size 8 at addr ffff88018f0a63d0 by task mount/1225 [ 3117.686477] CPU: 0 PID: 1225 Comm: mount Tainted: G W 4.17.0+ #1 [ 3117.686481] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 3117.686483] Call Trace: [ 3117.686494] dump_stack+0x71/0xab [ 3117.686512] print_address_description+0x6b/0x290 [ 3117.686517] kasan_report+0x28e/0x390 [ 3117.686522] ? 
__remove_dirty_segment+0xdd/0x1e0 [ 3117.686527] __remove_dirty_segment+0xdd/0x1e0 [ 3117.686532] locate_dirty_segment+0x189/0x190 [ 3117.686538] f2fs_allocate_new_segments+0xa9/0xe0 [ 3117.686543] recover_data+0x703/0x2c20 [ 3117.686547] ? f2fs_recover_fsync_data+0x48f/0xd50 [ 3117.686553] ? ksys_mount+0x7e/0xd0 [ 3117.686564] ? policy_nodemask+0x1a/0x90 [ 3117.686567] ? policy_node+0x56/0x70 [ 3117.686571] ? add_fsync_inode+0xf0/0xf0 [ 3117.686592] ? blk_finish_plug+0x44/0x60 [ 3117.686597] ? f2fs_ra_meta_pages+0x38b/0x5e0 [ 3117.686602] ? find_inode_fast+0xac/0xc0 [ 3117.686606] ? f2fs_is_valid_blkaddr+0x320/0x320 [ 3117.686618] ? __radix_tree_lookup+0x150/0x150 [ 3117.686633] ? dqget+0x670/0x670 [ 3117.686648] ? pagecache_get_page+0x29/0x410 [ 3117.686656] ? kmem_cache_alloc+0x176/0x1e0 [ 3117.686660] ? f2fs_is_valid_blkaddr+0x11d/0x320 [ 3117.686664] f2fs_recover_fsync_data+0xc23/0xd50 [ 3117.686670] ? f2fs_space_for_roll_forward+0x60/0x60 [ 3117.686674] ? rb_insert_color+0x323/0x3d0 [ 3117.686678] ? f2fs_recover_orphan_inodes+0xa5/0x700 [ 3117.686683] ? proc_register+0x153/0x1d0 [ 3117.686686] ? f2fs_remove_orphan_inode+0x10/0x10 [ 3117.686695] ? f2fs_attr_store+0x50/0x50 [ 3117.686700] ? proc_create_single_data+0x52/0x60 [ 3117.686707] f2fs_fill_super+0x1d06/0x2b40 [ 3117.686728] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.686735] ? sget_userns+0x65e/0x690 [ 3117.686740] ? set_blocksize+0x88/0x130 [ 3117.686745] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.686748] mount_bdev+0x1c0/0x200 [ 3117.686753] mount_fs+0x5c/0x190 [ 3117.686758] vfs_kern_mount+0x64/0x190 [ 3117.686762] do_mount+0x2e4/0x1450 [ 3117.686769] ? lockref_put_return+0x130/0x130 [ 3117.686773] ? copy_mount_string+0x20/0x20 [ 3117.686777] ? kasan_unpoison_shadow+0x31/0x40 [ 3117.686780] ? kasan_kmalloc+0xa6/0xd0 [ 3117.686786] ? memcg_kmem_put_cache+0x16/0x90 [ 3117.686790] ? __kmalloc_track_caller+0x196/0x210 [ 3117.686795] ? _copy_from_user+0x61/0x90 [ 3117.686801] ? 
memdup_user+0x3e/0x60 [ 3117.686804] ksys_mount+0x7e/0xd0 [ 3117.686809] __x64_sys_mount+0x62/0x70 [ 3117.686816] do_syscall_64+0x73/0x160 [ 3117.686824] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.686829] RIP: 0033:0x7f5693f14b9a [ 3117.686830] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 3117.686887] RSP: 002b:00007fff27346488 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 3117.686892] RAX: ffffffffffffffda RBX: 00000000016e2030 RCX: 00007f5693f14b9a [ 3117.686894] RDX: 00000000016e2210 RSI: 00000000016e3f30 RDI: 00000000016ee040 [ 3117.686896] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 3117.686899] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 00000000016ee040 [ 3117.686901] R13: 00000000016e2210 R14: 0000000000000000 R15: 0000000000000003 [ 3117.687005] Allocated by task 1225: [ 3117.687152] kasan_kmalloc+0xa6/0xd0 [ 3117.687157] kmem_cache_alloc_trace+0xfd/0x200 [ 3117.687161] f2fs_build_segment_manager+0x2d09/0x3190 [ 3117.687165] f2fs_fill_super+0x1948/0x2b40 [ 3117.687168] mount_bdev+0x1c0/0x200 [ 3117.687171] mount_fs+0x5c/0x190 [ 3117.687174] vfs_kern_mount+0x64/0x190 [ 3117.687177] do_mount+0x2e4/0x1450 [ 3117.687180] ksys_mount+0x7e/0xd0 [ 3117.687182] __x64_sys_mount+0x62/0x70 [ 3117.687186] do_syscall_64+0x73/0x160 [ 3117.687190] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.687285] Freed by task 19: [ 3117.687412] __kasan_slab_free+0x137/0x190 [ 3117.687416] kfree+0x8b/0x1b0 [ 3117.687460] ttm_bo_man_put_node+0x61/0x80 [ttm] [ 3117.687476] ttm_bo_cleanup_refs+0x15f/0x250 [ttm] [ 3117.687492] ttm_bo_delayed_delete+0x2f0/0x300 [ttm] [ 3117.687507] ttm_bo_delayed_workqueue+0x17/0x50 [ttm] [ 3117.687528] process_one_work+0x2f9/0x740 [ 3117.687531] worker_thread+0x78/0x6b0 [ 3117.687541] kthread+0x177/0x1c0 [ 3117.687545] ret_from_fork+0x35/0x40 [ 3117.687638] The 
buggy address belongs to the object at ffff88018f0a6300 which belongs to the cache kmalloc-192 of size 192 [ 3117.688014] The buggy address is located 16 bytes to the right of 192-byte region [ffff88018f0a6300, ffff88018f0a63c0) [ 3117.688382] The buggy address belongs to the page: [ 3117.688554] page:ffffea00063c2980 count:1 mapcount:0 mapping:ffff8801f3403180 index:0x0 [ 3117.688788] flags: 0x17fff8000000100(slab) [ 3117.688944] raw: 017fff8000000100 ffffea00063c2840 0000000e0000000e ffff8801f3403180 [ 3117.689166] raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 [ 3117.689386] page dumped because: kasan: bad access detected [ 3117.689653] Memory state around the buggy address: [ 3117.689816] ffff88018f0a6280: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3117.690027] ffff88018f0a6300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 3117.690239] >ffff88018f0a6380: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 3117.690448] ^ [ 3117.690644] ffff88018f0a6400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 3117.690868] ffff88018f0a6480: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 3117.691077] ================================================================== [ 3117.691290] Disabling lock debugging due to kernel taint [ 3117.693893] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 3117.694120] PGD 80000001f01bc067 P4D 80000001f01bc067 PUD 1d9638067 PMD 0 [ 3117.694338] Oops: 0002 [#1] SMP KASAN PTI [ 3117.694490] CPU: 1 PID: 1225 Comm: mount Tainted: G B W 4.17.0+ #1 [ 3117.694703] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 3117.695073] RIP: 0010:__remove_dirty_segment+0xe2/0x1e0 [ 3117.695246] Code: c4 48 89 c7 e8 cf bb d7 ff 45 0f b6 24 24 41 83 e4 3f 44 88 64 24 07 41 83 e4 3f 4a 8d 7c e3 08 e8 b3 bc d7 ff 4a 8b 4c e3 08 4c 0f b3 29 0f 82 94 00 00 00 48 8d bd 20 04 00 00 e8 97 bb d7 [ 3117.695793] RSP: 0018:ffff88018eb67638 EFLAGS: 00010292 [ 
3117.695969] RAX: 0000000000000000 RBX: ffff88018f0a6300 RCX: 0000000000000000 [ 3117.696182] RDX: 0000000000000000 RSI: 0000000000000297 RDI: 0000000000000297 [ 3117.696391] RBP: ffff88018ebe9980 R08: ffffed003e743ebb R09: ffffed003e743ebb [ 3117.696604] R10: 0000000000000001 R11: ffffed003e743eba R12: 0000000000000019 [ 3117.696813] R13: 0000000000000014 R14: 0000000000000320 R15: ffff88018ebe99e0 [ 3117.697032] FS: 00007f5694636840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 3117.697280] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3117.702357] CR2: 00007fe89bb1a000 CR3: 0000000191c22000 CR4: 00000000000006e0 [ 3117.707235] Call Trace: [ 3117.712077] locate_dirty_segment+0x189/0x190 [ 3117.716891] f2fs_allocate_new_segments+0xa9/0xe0 [ 3117.721617] recover_data+0x703/0x2c20 [ 3117.726316] ? f2fs_recover_fsync_data+0x48f/0xd50 [ 3117.730957] ? ksys_mount+0x7e/0xd0 [ 3117.735573] ? policy_nodemask+0x1a/0x90 [ 3117.740198] ? policy_node+0x56/0x70 [ 3117.744829] ? add_fsync_inode+0xf0/0xf0 [ 3117.749487] ? blk_finish_plug+0x44/0x60 [ 3117.754152] ? f2fs_ra_meta_pages+0x38b/0x5e0 [ 3117.758831] ? find_inode_fast+0xac/0xc0 [ 3117.763448] ? f2fs_is_valid_blkaddr+0x320/0x320 [ 3117.768046] ? __radix_tree_lookup+0x150/0x150 [ 3117.772603] ? dqget+0x670/0x670 [ 3117.777159] ? pagecache_get_page+0x29/0x410 [ 3117.781648] ? kmem_cache_alloc+0x176/0x1e0 [ 3117.786067] ? f2fs_is_valid_blkaddr+0x11d/0x320 [ 3117.790476] f2fs_recover_fsync_data+0xc23/0xd50 [ 3117.794790] ? f2fs_space_for_roll_forward+0x60/0x60 [ 3117.799086] ? rb_insert_color+0x323/0x3d0 [ 3117.803304] ? f2fs_recover_orphan_inodes+0xa5/0x700 [ 3117.807563] ? proc_register+0x153/0x1d0 [ 3117.811766] ? f2fs_remove_orphan_inode+0x10/0x10 [ 3117.815947] ? f2fs_attr_store+0x50/0x50 [ 3117.820087] ? proc_create_single_data+0x52/0x60 [ 3117.824262] f2fs_fill_super+0x1d06/0x2b40 [ 3117.828367] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.832432] ? sget_userns+0x65e/0x690 [ 3117.836500] ? 
set_blocksize+0x88/0x130 [ 3117.840501] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.844420] mount_bdev+0x1c0/0x200 [ 3117.848275] mount_fs+0x5c/0x190 [ 3117.852053] vfs_kern_mount+0x64/0x190 [ 3117.855810] do_mount+0x2e4/0x1450 [ 3117.859441] ? lockref_put_return+0x130/0x130 [ 3117.862996] ? copy_mount_string+0x20/0x20 [ 3117.866417] ? kasan_unpoison_shadow+0x31/0x40 [ 3117.869719] ? kasan_kmalloc+0xa6/0xd0 [ 3117.872948] ? memcg_kmem_put_cache+0x16/0x90 [ 3117.876121] ? __kmalloc_track_caller+0x196/0x210 [ 3117.879333] ? _copy_from_user+0x61/0x90 [ 3117.882467] ? memdup_user+0x3e/0x60 [ 3117.885604] ksys_mount+0x7e/0xd0 [ 3117.888700] __x64_sys_mount+0x62/0x70 [ 3117.891742] do_syscall_64+0x73/0x160 [ 3117.894692] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.897669] RIP: 0033:0x7f5693f14b9a [ 3117.900563] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 3117.906922] RSP: 002b:00007fff27346488 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 3117.910159] RAX: ffffffffffffffda RBX: 00000000016e2030 RCX: 00007f5693f14b9a [ 3117.913469] RDX: 00000000016e2210 RSI: 00000000016e3f30 RDI: 00000000016ee040 [ 3117.916764] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 3117.920071] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 00000000016ee040 [ 3117.923393] R13: 00000000016e2210 R14: 0000000000000000 R15: 0000000000000003 [ 3117.926680] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer joydev input_leds serio_raw snd soundcore mac_hid i2c_piix4 ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear 8139too qxl ttm drm_kms_helper 
syscopyarea sysfillrect sysimgblt fb_sys_fops drm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel psmouse aes_x86_64 8139cp crypto_simd cryptd mii glue_helper pata_acpi floppy [ 3117.949979] CR2: 0000000000000000 [ 3117.954283] ---[ end trace a8e0d899985faf32 ]--- [ 3117.958575] RIP: 0010:__remove_dirty_segment+0xe2/0x1e0 [ 3117.962810] Code: c4 48 89 c7 e8 cf bb d7 ff 45 0f b6 24 24 41 83 e4 3f 44 88 64 24 07 41 83 e4 3f 4a 8d 7c e3 08 e8 b3 bc d7 ff 4a 8b 4c e3 08 4c 0f b3 29 0f 82 94 00 00 00 48 8d bd 20 04 00 00 e8 97 bb d7 [ 3117.971789] RSP: 0018:ffff88018eb67638 EFLAGS: 00010292 [ 3117.976333] RAX: 0000000000000000 RBX: ffff88018f0a6300 RCX: 0000000000000000 [ 3117.980926] RDX: 0000000000000000 RSI: 0000000000000297 RDI: 0000000000000297 [ 3117.985497] RBP: ffff88018ebe9980 R08: ffffed003e743ebb R09: ffffed003e743ebb [ 3117.990098] R10: 0000000000000001 R11: ffffed003e743eba R12: 0000000000000019 [ 3117.994761] R13: 0000000000000014 R14: 0000000000000320 R15: ffff88018ebe99e0 [ 3117.999392] FS: 00007f5694636840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 3118.004096] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3118.008816] CR2: 00007fe89bb1a000 CR3: 0000000191c22000 CR4: 00000000000006e0 - Location https://elixir.bootlin.com/linux/v4.18-rc3/source/fs/f2fs/segment.c#L775 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; Here dirty_i->dirty_segmap[t] can be NULL which leads to crash in test_and_clear_bit() Reported-by Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++---- fs/f2fs/super.c | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 901d696dded7..08e3dcfc0cd9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -881,15 +881,15 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); 
memcpy(sbi->ckpt, cp_block, blk_size); - /* Sanity checking of checkpoint */ - if (f2fs_sanity_check_ckpt(sbi)) - goto free_fail_no_cp; - if (cur_page == cp1) sbi->cur_cp_pack = 1; else sbi->cur_cp_pack = 2; + /* Sanity checking of checkpoint */ + if (f2fs_sanity_check_ckpt(sbi)) + goto free_fail_no_cp; + if (cp_blks <= 1) goto done; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9d44f60da697..b00f1b342474 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2297,6 +2297,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int sit_bitmap_size, nat_bitmap_size; unsigned int log_blocks_per_seg; unsigned int segment_count_main; + unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count; int i; @@ -2357,6 +2358,17 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + cp_pack_start_sum = __start_sum_addr(sbi); + cp_payload = __cp_payload(sbi); + if (cp_pack_start_sum < cp_payload + 1 || + cp_pack_start_sum > blocks_per_seg - 1 - + NR_CURSEG_TYPE) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong cp_pack_start_sum: %u", + cp_pack_start_sum); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 2e68719a085228d521f4178755f8b7ce8a0839a5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 1 Aug 2018 19:51:38 -0500 Subject: [PATCH 0955/1212] f2fs: use true and false for boolean values Return statements in functions returning bool should use true or false instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5323390f49b4..4f7ec7ec48ec 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1399,7 +1399,7 @@ static inline bool is_idle(struct f2fs_sb_info *sbi) struct request_list *rl = &q->root_rl; if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) - return 0; + return false; return f2fs_time_over(sbi, REQ_TIME); } @@ -3485,7 +3485,7 @@ static inline bool f2fs_may_encrypt(struct inode *inode) return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else - return 0; + return false; #endif } From 46ce4b0af9098fd8b425ce1662e4ef49c285d75f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Aug 2018 23:03:19 +0800 Subject: [PATCH 0956/1212] f2fs: fix to avoid broken of dnode block list The f2fs recovery flow relies on the dnode block linked list, which means that recovery of an fsynced file depends on the previous dnodes in the list having been persisted. So during fsync() we should wait until all regular inodes' dnodes have been written back before issuing a flush. This way, we avoid the dnode block list being broken by out-of-order IO submission in the IO scheduler or driver. 
Sheng Yong helped to test this patch: Target:/data (f2fs, -) 64MB / 32768KB / 4KB / 8 1 / PERSIST / Index Base: SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS) 1 867.82 204.15 41440.03 41370.54 680.8 1025.94 1031.08 2 871.87 205.87 41370.3 40275.2 791.14 1065.84 1101.7 3 866.52 205.69 41795.67 40596.16 694.69 1037.16 1031.48 Avg 868.7366667 205.2366667 41535.33333 40747.3 722.21 1042.98 1054.753333 After: SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS) 1 798.81 202.5 41143 40613.87 602.71 838.08 913.83 2 805.79 206.47 40297.2 41291.46 604.44 840.75 924.27 3 814.83 206.17 41209.57 40453.62 602.85 834.66 927.91 Avg 806.4766667 205.0466667 40883.25667 40786.31667 603.3333333 837.83 922.0033333 Patched/Original: 0.928332713 0.999074239 0.984300676 1.000957528 0.835398753 0.803303994 0.874141189 It looks like atomic write suffers a performance regression. I suspect the culprit is that we are forcing a wait for all dnodes to reach the storage cache before we issue PREFLUSH+FUA. BTW, will commit ("f2fs: don't need to wait for node writes for atomic write") cause this problem: we lose the data of the last transaction after SPO, even if the atomic write returns no error? - atomic_open(); - write() P1, P2, P3; - atomic_commit(); - writeback data: P1, P2, P3; - writeback node: N1, N2, N3; <--- If N1 and N2 are not written back but N3 with fsync_mark is written back, then in SPOR we won't find N3 since the node chain is broken, which results in losing the last transaction. 
- preflush + fua; - power-cut If we don't wait for dnode writeback for atomic_write: SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS) 1 779.91 206.03 41621.5 40333.16 716.9 1038.21 1034.85 2 848.51 204.35 40082.44 39486.17 791.83 1119.96 1083.77 3 772.12 206.27 41335.25 41599.65 723.29 1055.07 971.92 Avg 800.18 205.55 41013.06333 40472.99333 744.0066667 1071.08 1030.18 Patched/Original: 0.92108464 1.001526693 0.987425886 0.993268102 1.030180511 1.026942031 0.976702294 SQLite's performance recovers. Jaegeuk: "Practically, I don't see db corruption because of this. We can excuse to lose the last transaction." Finally, we decided to keep the original semantics of the atomic write interface: we don't wait for all dnode writeback before preflush+fua submission. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++- fs/f2fs/data.c | 2 + fs/f2fs/f2fs.h | 22 ++++++- fs/f2fs/file.c | 5 +- fs/f2fs/node.c | 148 +++++++++++++++++++++++++++++++++++-------- fs/f2fs/super.c | 6 ++ 6 files changed, 158 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 08e3dcfc0cd9..ebbe3bc8e9cd 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1162,7 +1162,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) f2fs_unlock_all(sbi); } -static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) { DEFINE_WAIT(wait); @@ -1398,7 +1398,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ - wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages_writeback(sbi); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); @@ -1407,7 +1407,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* barrier and flush checkpoint cp pack 2 
page if it can */ commit_checkpoint(sbi, ckpt, start_blk); - wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages_writeback(sbi); /* * invalidate intermediate page cache borrowed from meta inode @@ -1419,6 +1419,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_release_ino_entry(sbi, false); + f2fs_reset_fsync_node_info(sbi); + clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a05a4d2205ad..5088609f0b8a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -176,6 +176,8 @@ static void f2fs_write_end_io(struct bio *bio) page->index != nid_of_node(page)); dec_page_count(sbi, type); + if (f2fs_in_warm_node_list(sbi, page)) + f2fs_del_fsync_node_entry(sbi, page); clear_cold_data(page); end_page_writeback(page); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4f7ec7ec48ec..c07f67178767 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -296,6 +296,12 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; +struct fsync_node_entry { + struct list_head list; /* list head */ + struct page *page; /* warm node page pointer */ + unsigned int seq_id; /* sequence id */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1221,6 +1227,11 @@ struct f2fs_sb_info { struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + spinlock_t fsync_node_lock; /* for node entry lock */ + struct list_head fsync_node_list; /* node list head */ + unsigned int fsync_seg_id; /* sequence id */ + unsigned int fsync_node_num; /* number of node entries */ + /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ @@ -2912,6 +2923,10 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +bool f2fs_in_warm_node_list(struct f2fs_sb_info 
*sbi, struct page *page); +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); @@ -2921,7 +2936,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -2930,7 +2946,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); void f2fs_move_node_page(struct page *node_page, int gc_type); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, - struct writeback_control *wbc, bool atomic); + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id); int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); @@ -3047,6 +3064,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info 
*sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 53100ebac81e..40567031b2fc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -216,6 +216,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; + unsigned int seq_id = 0; if (unlikely(f2fs_readonly(inode->i_sb))) return 0; @@ -278,7 +279,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } sync_nodes: atomic_inc(&sbi->wb_sync_req[NODE]); - ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id); atomic_dec(&sbi->wb_sync_req[NODE]); if (ret) goto out; @@ -304,7 +305,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * given fsync mark. */ if (!atomic) { - ret = f2fs_wait_on_node_pages_writeback(sbi, ino); + ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id); if (ret) goto out; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c5d230733285..5cdd59031674 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -28,6 +28,7 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; +static struct kmem_cache *fsync_node_entry_slab; /* * Check whether the given nid is within node id range. 
@@ -264,6 +265,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +{ + return NODE_MAPPING(sbi) == page->mapping && + IS_DNODE(page) && is_cold_node(page); +} + +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) +{ + spin_lock_init(&sbi->fsync_node_lock); + INIT_LIST_HEAD(&sbi->fsync_node_list); + sbi->fsync_seg_id = 0; + sbi->fsync_node_num = 0; +} + +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, + struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + unsigned int seq_id; + + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + + get_page(page); + fn->page = page; + INIT_LIST_HEAD(&fn->list); + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_add_tail(&fn->list, &sbi->fsync_node_list); + fn->seq_id = sbi->fsync_seg_id++; + seq_id = fn->seq_id; + sbi->fsync_node_num++; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + return seq_id; +} + +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_for_each_entry(fn, &sbi->fsync_node_list, list) { + if (fn->page == page) { + list_del(&fn->list); + sbi->fsync_node_num--; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + kmem_cache_free(fsync_node_entry_slab, fn); + put_page(page); + return; + } + } + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + f2fs_bug_on(sbi, 1); +} + +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + sbi->fsync_seg_id = 0; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); +} + int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1388,7 +1455,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info 
*sbi, nid_t ino) static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct writeback_control *wbc, bool do_balance, - enum iostat_type io_type) + enum iostat_type io_type, unsigned int *seq_id) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1405,6 +1472,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .io_type = io_type, .io_wbc = wbc, }; + unsigned int seq; trace_f2fs_writepage(page, NODE); @@ -1450,6 +1518,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, set_page_writeback(page); ClearPageError(page); + + if (f2fs_in_warm_node_list(sbi, page)) { + seq = f2fs_add_fsync_node_entry(sbi, page); + if (seq_id) + *seq_id = seq; + } + fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); @@ -1497,7 +1572,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type) goto out_page; if (__write_node_page(node_page, false, NULL, - &wbc, false, FS_GC_NODE_IO)) + &wbc, false, FS_GC_NODE_IO, NULL)) unlock_page(node_page); goto release_page; } else { @@ -1514,11 +1589,13 @@ void f2fs_move_node_page(struct page *node_page, int gc_type) static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); + return __write_node_page(page, false, NULL, wbc, false, + FS_NODE_IO, NULL); } int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, - struct writeback_control *wbc, bool atomic) + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id) { pgoff_t index; pgoff_t last_idx = ULONG_MAX; @@ -1599,7 +1676,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, ret = __write_node_page(page, atomic && page == last_page, &submitted, wbc, true, - FS_NODE_IO); + FS_NODE_IO, seq_id); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1716,7 +1793,7 @@ 
int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, set_dentry_mark(page, 0); ret = __write_node_page(page, false, &submitted, - wbc, do_balance, io_type); + wbc, do_balance, io_type, NULL); if (ret) unlock_page(page); else if (submitted) @@ -1749,30 +1826,40 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, return ret; } -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id) { - pgoff_t index = 0; - struct pagevec pvec; + struct fsync_node_entry *fn; + struct page *page; + struct list_head *head = &sbi->fsync_node_list; + unsigned long flags; + unsigned int cur_seq_id = 0; int ret2 = 0, ret = 0; - int nr_pages; - pagevec_init(&pvec, 0); - - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK))) { - int i; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE, true); - if (TestClearPageError(page)) - ret = -EIO; - } + while (seq_id && cur_seq_id < seq_id) { + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; } - pagevec_release(&pvec); - cond_resched(); + fn = list_first_entry(head, struct fsync_node_entry, list); + if (fn->seq_id > seq_id) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + cur_seq_id = fn->seq_id; + page = fn->page; + get_page(page); + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + f2fs_wait_on_page_writeback(page, NODE, true); + if (TestClearPageError(page)) + ret = -EIO; + + put_page(page); + + if (ret) + break; } if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) @@ -1781,6 +1868,7 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) ret2 = -EIO; if (!ret) ret = ret2; + return ret; } @@ -2995,8 +3083,15 @@ int __init 
f2fs_create_node_manager_caches(void) sizeof(struct nat_entry_set)); if (!nat_entry_set_slab) goto destroy_free_nid; + + fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry", + sizeof(struct fsync_node_entry)); + if (!fsync_node_entry_slab) + goto destroy_nat_entry_set; return 0; +destroy_nat_entry_set: + kmem_cache_destroy(nat_entry_set_slab); destroy_free_nid: kmem_cache_destroy(free_nid_slab); destroy_nat_entry: @@ -3007,6 +3102,7 @@ int __init f2fs_create_node_manager_caches(void) void f2fs_destroy_node_manager_caches(void) { + kmem_cache_destroy(fsync_node_entry_slab); kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b00f1b342474..a2368f4934f5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1036,6 +1036,10 @@ static void f2fs_put_super(struct super_block *sb) /* our cp_error case, we can wait for any writeback page */ f2fs_flush_merged_writes(sbi); + f2fs_wait_on_all_pages_writeback(sbi); + + f2fs_bug_on(sbi, sbi->fsync_node_num); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -2919,6 +2923,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_init_ino_entry_info(sbi); + f2fs_init_fsync_node_info(sbi); + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { From a08bdb50e2a44af1c5e645da3d68fd78c26a05dd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Aug 2018 22:59:12 +0800 Subject: [PATCH 0957/1212] f2fs: fix invalid memory access syzbot found the following crash on: HEAD commit: d9bd94c0bcaa Add linux-next specific files for 20180801 git tree: linux-next console output: https://syzkaller.appspot.com/x/log.txt?x=1001189c400000 kernel config: https://syzkaller.appspot.com/x/.config?x=cc8964ea4d04518c dashboard link: https://syzkaller.appspot.com/bug?extid=c966a82db0b14aa37e81 compiler: gcc (GCC) 8.0.1 20180413 (experimental) Unfortunately, I don't have any 
reproducer for this crash yet. IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+c966a82db0b14aa37e81@syzkaller.appspotmail.com loop7: rw=12288, want=8200, limit=20 netlink: 65342 bytes leftover after parsing attributes in process `syz-executor4'. openvswitch: netlink: Message has 8 unknown bytes. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN CPU: 1 PID: 7615 Comm: syz-executor7 Not tainted 4.18.0-rc7-next-20180801+ #29 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:188 [inline] RIP: 0010:compound_head include/linux/page-flags.h:142 [inline] RIP: 0010:PageLocked include/linux/page-flags.h:272 [inline] RIP: 0010:f2fs_put_page fs/f2fs/f2fs.h:2011 [inline] RIP: 0010:validate_checkpoint+0x66d/0xec0 fs/f2fs/checkpoint.c:835 Code: e8 58 05 7f fe 4c 8d 6b 80 4d 8d 74 24 08 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 ea 03 c6 04 02 00 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 f4 06 00 00 4c 89 ea 4d 8b 7c 24 08 48 b8 00 00 RSP: 0018:ffff8801937cebe8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff8801937cef30 RCX: ffffc90006035000 RDX: 0000000000000000 RSI: ffffffff82fd9658 RDI: 0000000000000005 RBP: ffff8801937cef58 R08: ffff8801ab254700 R09: fffff94000d9e026 R10: fffff94000d9e026 R11: ffffea0006cf0137 R12: fffffffffffffffb R13: ffff8801937ceeb0 R14: 0000000000000003 R15: ffff880193419b40 FS: 00007f36a61d5700(0000) GS:ffff8801db100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc04ff93000 CR3: 00000001d0562000 CR4: 00000000001426e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: f2fs_get_valid_checkpoint+0x436/0x1ec0 fs/f2fs/checkpoint.c:860 f2fs_fill_super+0x2d42/0x8110 fs/f2fs/super.c:2883 
mount_bdev+0x314/0x3e0 fs/super.c:1344 f2fs_mount+0x3c/0x50 fs/f2fs/super.c:3133 legacy_get_tree+0x131/0x460 fs/fs_context.c:729 vfs_get_tree+0x1cb/0x5c0 fs/super.c:1743 do_new_mount fs/namespace.c:2603 [inline] do_mount+0x6f2/0x1e20 fs/namespace.c:2927 ksys_mount+0x12d/0x140 fs/namespace.c:3143 __do_sys_mount fs/namespace.c:3157 [inline] __se_sys_mount fs/namespace.c:3154 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3154 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x45943a Code: b8 a6 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 bd 8a fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 0f 83 9a 8a fb ff c3 66 0f 1f 84 00 00 00 00 00 RSP: 002b:00007f36a61d4a88 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 00007f36a61d4b30 RCX: 000000000045943a RDX: 00007f36a61d4ad0 RSI: 0000000020000100 RDI: 00007f36a61d4af0 RBP: 0000000020000100 R08: 00007f36a61d4b30 R09: 00007f36a61d4ad0 R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000013 R13: 0000000000000000 R14: 00000000004c8ea0 R15: 0000000000000000 Modules linked in: Dumping ftrace buffer: (ftrace buffer empty) ---[ end trace bd8550c129352286 ]--- RIP: 0010:__read_once_size include/linux/compiler.h:188 [inline] RIP: 0010:compound_head include/linux/page-flags.h:142 [inline] RIP: 0010:PageLocked include/linux/page-flags.h:272 [inline] RIP: 0010:f2fs_put_page fs/f2fs/f2fs.h:2011 [inline] RIP: 0010:validate_checkpoint+0x66d/0xec0 fs/f2fs/checkpoint.c:835 Code: e8 58 05 7f fe 4c 8d 6b 80 4d 8d 74 24 08 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 ea 03 c6 04 02 00 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 f4 06 00 00 4c 89 ea 4d 8b 7c 24 08 48 b8 00 00 RSP: 0018:ffff8801937cebe8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff8801937cef30 RCX: ffffc90006035000 RDX: 0000000000000000 RSI: ffffffff82fd9658 RDI: 0000000000000005 netlink: 65342 bytes leftover after parsing attributes in process 
`syz-executor4'. RBP: ffff8801937cef58 R08: ffff8801ab254700 R09: fffff94000d9e026 openvswitch: netlink: Message has 8 unknown bytes. R10: fffff94000d9e026 R11: ffffea0006cf0137 R12: fffffffffffffffb R13: ffff8801937ceeb0 R14: 0000000000000003 R15: ffff880193419b40 FS: 00007f36a61d5700(0000) GS:ffff8801db100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc04ff93000 CR3: 00000001d0562000 CR4: 00000000001426e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 In validate_checkpoint(), if we failed to call get_checkpoint_version(), we will pass returned invalid page pointer into f2fs_put_page, cause accessing invalid memory, this patch tries to handle error path correctly to fix this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ebbe3bc8e9cd..9caec7622d1e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -781,6 +781,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, crc_offset = le32_to_cpu((*cp_block)->checksum_offset); if (crc_offset > (blk_size - sizeof(__le32))) { + f2fs_put_page(*cp_page, 1); f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; @@ -788,6 +789,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_put_page(*cp_page, 1); f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; } @@ -807,14 +809,14 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_1, version); if (err) - goto invalid_cp1; + return NULL; if (le32_to_cpu(cp_block->cp_pack_total_block_count) > 
sbi->blocks_per_seg) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); - goto invalid_cp1; + goto invalid_cp; } pre_version = *version; @@ -822,7 +824,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) - goto invalid_cp2; + goto invalid_cp; cur_version = *version; if (cur_version == pre_version) { @@ -830,9 +832,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, f2fs_put_page(cp_page_2, 1); return cp_page_1; } -invalid_cp2: f2fs_put_page(cp_page_2, 1); -invalid_cp1: +invalid_cp: f2fs_put_page(cp_page_1, 1); return NULL; } From d94f27f040148d500519e8bf5f2a8a4d43f2945f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Jul 2018 18:37:58 +0800 Subject: [PATCH 0958/1212] f2fs: fix to reset i_gc_failures correctly Let's reset i_gc_failures to zero when we unset pinned state for file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 40567031b2fc..f476c4857c32 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2622,7 +2622,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); - F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = 1; + f2fs_i_gc_failures_write(inode, 0); goto done; } From ad792eda0e992b647eccc5ae297f705a7b8c3577 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Jun 2018 00:19:25 +0800 Subject: [PATCH 0959/1212] f2fs: fix to do sanity check with inline flags https://bugzilla.kernel.org/show_bug.cgi?id=200221 - Overview BUG() in clear_inode() when mounting and un-mounting a corrupted f2fs image - Reproduce - Kernel message [ 538.601448] F2FS-fs (loop0): Invalid segment/section count (31, 24 x 1376257) [ 538.601458] F2FS-fs (loop0): Can't find valid F2FS filesystem in 
2th superblock [ 538.724091] F2FS-fs (loop0): Try to recover 2th superblock, ret: 0 [ 538.724102] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 540.970834] ------------[ cut here ]------------ [ 540.970838] kernel BUG at fs/inode.c:512! [ 540.971750] invalid opcode: 0000 [#1] SMP KASAN PTI [ 540.972755] CPU: 1 PID: 1305 Comm: umount Not tainted 4.18.0-rc1+ #4 [ 540.974034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 540.982913] RIP: 0010:clear_inode+0xc0/0xd0 [ 540.983774] Code: 8d a3 30 01 00 00 4c 89 e7 e8 1c ec f8 ff 48 8b 83 30 01 00 00 49 39 c4 75 1a 48 c7 83 a0 00 00 00 60 00 00 00 5b 41 5c 5d c3 <0f> 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 1f 40 00 66 66 66 66 90 55 [ 540.987570] RSP: 0018:ffff8801e34a7b70 EFLAGS: 00010002 [ 540.988636] RAX: 0000000000000000 RBX: ffff8801e9b744e8 RCX: ffffffffb840eb3a [ 540.990063] RDX: dffffc0000000000 RSI: 0000000000000004 RDI: ffff8801e9b746b8 [ 540.991499] RBP: ffff8801e34a7b80 R08: ffffed003d36e8ce R09: ffffed003d36e8ce [ 540.992923] R10: 0000000000000001 R11: ffffed003d36e8cd R12: ffff8801e9b74668 [ 540.994360] R13: ffff8801e9b74760 R14: ffff8801e9b74528 R15: ffff8801e9b74530 [ 540.995786] FS: 00007f4662bdf840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 540.997403] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 540.998571] CR2: 000000000175c568 CR3: 00000001dcfe6000 CR4: 00000000000006e0 [ 541.000015] Call Trace: [ 541.000554] f2fs_evict_inode+0x253/0x630 [ 541.001381] evict+0x16f/0x290 [ 541.002015] iput+0x280/0x300 [ 541.002654] dentry_unlink_inode+0x165/0x1e0 [ 541.003528] __dentry_kill+0x16a/0x260 [ 541.004300] dentry_kill+0x70/0x250 [ 541.005018] dput+0x154/0x1d0 [ 541.005635] do_one_tree+0x34/0x40 [ 541.006354] shrink_dcache_for_umount+0x3f/0xa0 [ 541.007285] generic_shutdown_super+0x43/0x1c0 [ 541.008192] kill_block_super+0x52/0x80 [ 541.008978] kill_f2fs_super+0x62/0x70 [ 541.009750] deactivate_locked_super+0x6f/0xa0 [ 541.010664] 
deactivate_super+0x5e/0x80 [ 541.011450] cleanup_mnt+0x61/0xa0 [ 541.012151] __cleanup_mnt+0x12/0x20 [ 541.012893] task_work_run+0xc8/0xf0 [ 541.013635] exit_to_usermode_loop+0x125/0x130 [ 541.014555] do_syscall_64+0x138/0x170 [ 541.015340] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 541.016375] RIP: 0033:0x7f46624bf487 [ 541.017104] Code: 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 c9 2b 00 f7 d8 64 89 01 48 [ 541.020923] RSP: 002b:00007fff5e12e9a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 541.022452] RAX: 0000000000000000 RBX: 0000000001753030 RCX: 00007f46624bf487 [ 541.023885] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000000000175a1e0 [ 541.025318] RBP: 000000000175a1e0 R08: 0000000000000000 R09: 0000000000000014 [ 541.026755] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f46629c883c [ 541.028186] R13: 0000000000000000 R14: 0000000001753210 R15: 00007fff5e12ec30 [ 541.029626] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 541.039445] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 541.040392] RIP: 0010:clear_inode+0xc0/0xd0 [ 541.041240] Code: 8d a3 30 01 00 00 4c 89 e7 e8 1c ec f8 ff 48 8b 83 30 01 00 00 49 39 c4 75 1a 48 c7 83 a0 00 00 00 60 00 00 00 5b 41 5c 5d c3 <0f> 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 1f 40 00 66 66 66 66 90 55 [ 541.045042] RSP: 0018:ffff8801e34a7b70 EFLAGS: 00010002 [ 541.046099] RAX: 0000000000000000 RBX: ffff8801e9b744e8 RCX: 
ffffffffb840eb3a [ 541.047537] RDX: dffffc0000000000 RSI: 0000000000000004 RDI: ffff8801e9b746b8 [ 541.048965] RBP: ffff8801e34a7b80 R08: ffffed003d36e8ce R09: ffffed003d36e8ce [ 541.050402] R10: 0000000000000001 R11: ffffed003d36e8cd R12: ffff8801e9b74668 [ 541.051832] R13: ffff8801e9b74760 R14: ffff8801e9b74528 R15: ffff8801e9b74530 [ 541.053263] FS: 00007f4662bdf840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 541.054891] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 541.056039] CR2: 000000000175c568 CR3: 00000001dcfe6000 CR4: 00000000000006e0 [ 541.058506] ================================================================== [ 541.059991] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0 [ 541.061513] Read of size 8 at addr ffff8801e34a7970 by task umount/1305 [ 541.063302] CPU: 1 PID: 1305 Comm: umount Tainted: G D 4.18.0-rc1+ #4 [ 541.064838] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 541.066778] Call Trace: [ 541.067294] dump_stack+0x7b/0xb5 [ 541.067986] print_address_description+0x70/0x290 [ 541.068941] kasan_report+0x291/0x390 [ 541.069692] ? update_stack_state+0x38c/0x3e0 [ 541.070598] __asan_load8+0x54/0x90 [ 541.071315] update_stack_state+0x38c/0x3e0 [ 541.072172] ? __read_once_size_nocheck.constprop.7+0x20/0x20 [ 541.073340] ? vprintk_func+0x27/0x60 [ 541.074096] ? printk+0xa3/0xd3 [ 541.074762] ? __save_stack_trace+0x5e/0x100 [ 541.075634] unwind_next_frame.part.5+0x18e/0x490 [ 541.076594] ? unwind_dump+0x290/0x290 [ 541.077368] ? __show_regs+0x2c4/0x330 [ 541.078142] __unwind_start+0x106/0x190 [ 541.085422] __save_stack_trace+0x5e/0x100 [ 541.086268] ? __save_stack_trace+0x5e/0x100 [ 541.087161] ? unlink_anon_vmas+0xba/0x2c0 [ 541.087997] save_stack_trace+0x1f/0x30 [ 541.088782] save_stack+0x46/0xd0 [ 541.089475] ? __alloc_pages_slowpath+0x1420/0x1420 [ 541.090477] ? flush_tlb_mm_range+0x15e/0x220 [ 541.091364] ? __dec_node_state+0x24/0xb0 [ 541.092180] ? 
lock_page_memcg+0x85/0xf0 [ 541.092979] ? unlock_page_memcg+0x16/0x80 [ 541.093812] ? page_remove_rmap+0x198/0x520 [ 541.094674] ? mark_page_accessed+0x133/0x200 [ 541.095559] ? _cond_resched+0x1a/0x50 [ 541.096326] ? unmap_page_range+0xcd4/0xe50 [ 541.097179] ? rb_next+0x58/0x80 [ 541.097845] ? rb_next+0x58/0x80 [ 541.098518] __kasan_slab_free+0x13c/0x1a0 [ 541.099352] ? unlink_anon_vmas+0xba/0x2c0 [ 541.100184] kasan_slab_free+0xe/0x10 [ 541.100934] kmem_cache_free+0x89/0x1e0 [ 541.101724] unlink_anon_vmas+0xba/0x2c0 [ 541.102534] free_pgtables+0x101/0x1b0 [ 541.103299] exit_mmap+0x146/0x2a0 [ 541.103996] ? __ia32_sys_munmap+0x50/0x50 [ 541.104829] ? kasan_check_read+0x11/0x20 [ 541.105649] ? mm_update_next_owner+0x322/0x380 [ 541.106578] mmput+0x8b/0x1d0 [ 541.107191] do_exit+0x43a/0x1390 [ 541.107876] ? mm_update_next_owner+0x380/0x380 [ 541.108791] ? deactivate_super+0x5e/0x80 [ 541.109610] ? cleanup_mnt+0x61/0xa0 [ 541.110351] ? __cleanup_mnt+0x12/0x20 [ 541.111115] ? task_work_run+0xc8/0xf0 [ 541.111879] ? exit_to_usermode_loop+0x125/0x130 [ 541.112817] rewind_stack_do_exit+0x17/0x20 [ 541.113666] RIP: 0033:0x7f46624bf487 [ 541.114404] Code: Bad RIP value. 
[ 541.115094] RSP: 002b:00007fff5e12e9a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 541.116605] RAX: 0000000000000000 RBX: 0000000001753030 RCX: 00007f46624bf487 [ 541.118034] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000000000175a1e0 [ 541.119472] RBP: 000000000175a1e0 R08: 0000000000000000 R09: 0000000000000014 [ 541.120890] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f46629c883c [ 541.122321] R13: 0000000000000000 R14: 0000000001753210 R15: 00007fff5e12ec30 [ 541.124061] The buggy address belongs to the page: [ 541.125042] page:ffffea00078d29c0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 541.126651] flags: 0x2ffff0000000000() [ 541.127418] raw: 02ffff0000000000 dead000000000100 dead000000000200 0000000000000000 [ 541.128963] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 541.130516] page dumped because: kasan: bad access detected [ 541.131954] Memory state around the buggy address: [ 541.132924] ffff8801e34a7800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00 [ 541.134378] ffff8801e34a7880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 541.135814] >ffff8801e34a7900: 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 [ 541.137253] ^ [ 541.138637] ffff8801e34a7980: f1 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 541.140075] ffff8801e34a7a00: 00 00 00 00 00 00 00 00 f3 00 00 00 00 00 00 00 [ 541.141509] ================================================================== - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/inode.c#L512 BUG_ON(inode->i_data.nrpages); The root cause is root directory inode is corrupted, it has both inline_data and inline_dentry flag, and its nlink is zero, so in ->evict(), after dropping all page cache, it grabs page #0 for inline data truncation, result in panic in later clear_inode() where we will check inode->i_data.nrpages value. This patch adds inline flags check in sanity_check_inode, in addition, do sanity check with root inode's nlink. 
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 20 ++++++++++++++++++++ fs/f2fs/super.c | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 480351d836f4..abb9f4877c53 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -265,6 +265,26 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } } + + if (f2fs_has_inline_data(inode) && + (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_data, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_dentry, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + return true; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a2368f4934f5..59e20a6b6bd9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2971,7 +2971,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto free_stats; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + if (!S_ISDIR(root->i_mode) || !root->i_blocks || + !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; goto free_stats; From bee931f3f2edf7cd0af619574b92cf36619a7333 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 10 Jul 2018 23:01:45 +0800 Subject: [PATCH 0960/1212] f2fs: fix to do sanity check with block address in main area v2 This patch adds f2fs_is_valid_blkaddr() in below functions to do sanity check with block address to avoid potential panic: - f2fs_grab_read_bio() - __written_first_block() https://bugzilla.kernel.org/show_bug.cgi?id=200465 - Reproduce - POC (poc.c) #define
_GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void activity(char *mpoint) { char *xattr; int err; err = asprintf(&xattr, "%s/foo/bar/xattr", mpoint); char buf2[113]; memset(buf2, 0, sizeof(buf2)); listxattr(xattr, buf2, sizeof(buf2)); } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - kernel message [ 844.718738] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 846.430929] F2FS-fs (loop0): access invalid blkaddr:1024 [ 846.431058] WARNING: CPU: 1 PID: 1249 at fs/f2fs/checkpoint.c:154 f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.431059] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.431310] CPU: 1 PID: 1249 Comm: a.out Not tainted 4.18.0-rc3+ #1 [ 846.431312] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.431315] RIP: 0010:f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.431316] Code: 00 eb ed 31 c0 83 fa 05 75 ae 48 83 ec 08 48 8b 3f 89 f1 48 c7 c2 fc 0b 0f 8b 48 c7 c6 8b d7 09 8b 88 44 24 07 e8 61 8b ff ff <0f> 0b 0f b6 44 24 07 48 83 c4 08 eb 81 4c 8b 47 10 8b 8f 38 04 00 [ 846.431347] RSP: 0018:ffff961c414a7bc0 EFLAGS: 00010282 [ 846.431349] RAX: 0000000000000000 RBX: ffffc5f787b8ea80 RCX: 0000000000000000 [ 846.431350] RDX: 0000000000000000 RSI: ffff89dfffd165d8 RDI: ffff89dfffd165d8 [ 846.431351] RBP: 
ffff961c414a7c20 R08: 0000000000000001 R09: 0000000000000248 [ 846.431353] R10: 0000000000000000 R11: 0000000000000248 R12: 0000000000000007 [ 846.431369] R13: ffff89dff5492800 R14: ffff89dfae3aa000 R15: ffff89dff4ff88d0 [ 846.431372] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.431373] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.431374] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.431384] Call Trace: [ 846.431426] f2fs_iget+0x6f4/0xe70 [ 846.431430] ? f2fs_find_entry+0x71/0x90 [ 846.431432] f2fs_lookup+0x1aa/0x390 [ 846.431452] __lookup_slow+0x97/0x150 [ 846.431459] lookup_slow+0x35/0x50 [ 846.431462] walk_component+0x1c6/0x470 [ 846.431479] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.431488] ? page_add_file_rmap+0x13/0x200 [ 846.431491] path_lookupat+0x76/0x230 [ 846.431501] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.431504] filename_lookup+0xb8/0x1a0 [ 846.431534] ? _cond_resched+0x16/0x40 [ 846.431541] ? kmem_cache_alloc+0x160/0x1d0 [ 846.431549] ? 
path_listxattr+0x41/0xa0 [ 846.431551] path_listxattr+0x41/0xa0 [ 846.431570] do_syscall_64+0x55/0x100 [ 846.431583] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.431607] RIP: 0033:0x7f882de1c0d7 [ 846.431607] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.431639] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.431641] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.431642] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.431643] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.431645] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.431646] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.431648] ---[ end trace abca54df39d14f5c ]--- [ 846.431651] F2FS-fs (loop0): invalid blkaddr: 1024, type: 5, run fsck to fix. 
[ 846.431762] WARNING: CPU: 1 PID: 1249 at fs/f2fs/f2fs.h:2697 f2fs_iget+0xd17/0xe70 [ 846.431763] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.431797] CPU: 1 PID: 1249 Comm: a.out Tainted: G W 4.18.0-rc3+ #1 [ 846.431798] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.431800] RIP: 0010:f2fs_iget+0xd17/0xe70 [ 846.431801] Code: ff ff 48 63 d8 e9 e1 f6 ff ff 48 8b 45 c8 41 b8 05 00 00 00 48 c7 c2 d8 e8 0e 8b 48 c7 c6 1d b0 0a 8b 48 8b 38 e8 f9 b4 00 00 <0f> 0b 48 8b 45 c8 f0 80 48 48 04 e9 d8 f9 ff ff 0f 0b 48 8b 43 18 [ 846.431832] RSP: 0018:ffff961c414a7bd0 EFLAGS: 00010282 [ 846.431834] RAX: 0000000000000000 RBX: ffffc5f787b8ea80 RCX: 0000000000000006 [ 846.431835] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffff89dfffd165d0 [ 846.431836] RBP: ffff961c414a7c20 R08: 0000000000000000 R09: 0000000000000273 [ 846.431837] R10: 0000000000000000 R11: ffff89dfad50ca60 R12: 0000000000000007 [ 846.431838] R13: ffff89dff5492800 R14: ffff89dfae3aa000 R15: ffff89dff4ff88d0 [ 846.431840] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.431841] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.431842] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.431846] Call Trace: [ 846.431850] ? 
f2fs_find_entry+0x71/0x90 [ 846.431853] f2fs_lookup+0x1aa/0x390 [ 846.431856] __lookup_slow+0x97/0x150 [ 846.431858] lookup_slow+0x35/0x50 [ 846.431874] walk_component+0x1c6/0x470 [ 846.431878] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.431880] ? page_add_file_rmap+0x13/0x200 [ 846.431882] path_lookupat+0x76/0x230 [ 846.431884] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.431886] filename_lookup+0xb8/0x1a0 [ 846.431890] ? _cond_resched+0x16/0x40 [ 846.431891] ? kmem_cache_alloc+0x160/0x1d0 [ 846.431894] ? path_listxattr+0x41/0xa0 [ 846.431896] path_listxattr+0x41/0xa0 [ 846.431898] do_syscall_64+0x55/0x100 [ 846.431901] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.431902] RIP: 0033:0x7f882de1c0d7 [ 846.431903] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.431934] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.431936] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.431937] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.431939] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.431940] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.431941] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.431943] ---[ end trace abca54df39d14f5d ]--- [ 846.432033] F2FS-fs (loop0): access invalid blkaddr:1024 [ 846.432051] WARNING: CPU: 1 PID: 1249 at fs/f2fs/checkpoint.c:154 f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.432051] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx 
raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.432085] CPU: 1 PID: 1249 Comm: a.out Tainted: G W 4.18.0-rc3+ #1 [ 846.432086] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.432089] RIP: 0010:f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.432089] Code: 00 eb ed 31 c0 83 fa 05 75 ae 48 83 ec 08 48 8b 3f 89 f1 48 c7 c2 fc 0b 0f 8b 48 c7 c6 8b d7 09 8b 88 44 24 07 e8 61 8b ff ff <0f> 0b 0f b6 44 24 07 48 83 c4 08 eb 81 4c 8b 47 10 8b 8f 38 04 00 [ 846.432120] RSP: 0018:ffff961c414a7900 EFLAGS: 00010286 [ 846.432122] RAX: 0000000000000000 RBX: 0000000000000400 RCX: 0000000000000006 [ 846.432123] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffff89dfffd165d0 [ 846.432124] RBP: ffff89dff5492800 R08: 0000000000000001 R09: 000000000000029d [ 846.432125] R10: ffff961c414a7820 R11: 000000000000029d R12: 0000000000000400 [ 846.432126] R13: 0000000000000000 R14: ffff89dff4ff88d0 R15: 0000000000000000 [ 846.432128] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.432130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.432131] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.432135] Call Trace: [ 846.432151] f2fs_wait_on_block_writeback+0x20/0x110 [ 846.432158] f2fs_grab_read_bio+0xbc/0xe0 [ 846.432161] f2fs_submit_page_read+0x21/0x280 [ 846.432163] f2fs_get_read_data_page+0xb7/0x3c0 [ 846.432165] f2fs_get_lock_data_page+0x29/0x1e0 [ 846.432167] f2fs_get_new_data_page+0x148/0x550 [ 846.432170] f2fs_add_regular_entry+0x1d2/0x550 [ 846.432178] ? 
__switch_to+0x12f/0x460 [ 846.432181] f2fs_add_dentry+0x6a/0xd0 [ 846.432184] f2fs_do_add_link+0xe9/0x140 [ 846.432186] __recover_dot_dentries+0x260/0x280 [ 846.432189] f2fs_lookup+0x343/0x390 [ 846.432193] __lookup_slow+0x97/0x150 [ 846.432195] lookup_slow+0x35/0x50 [ 846.432208] walk_component+0x1c6/0x470 [ 846.432212] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.432215] ? page_add_file_rmap+0x13/0x200 [ 846.432217] path_lookupat+0x76/0x230 [ 846.432219] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.432221] filename_lookup+0xb8/0x1a0 [ 846.432224] ? _cond_resched+0x16/0x40 [ 846.432226] ? kmem_cache_alloc+0x160/0x1d0 [ 846.432228] ? path_listxattr+0x41/0xa0 [ 846.432230] path_listxattr+0x41/0xa0 [ 846.432233] do_syscall_64+0x55/0x100 [ 846.432235] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.432237] RIP: 0033:0x7f882de1c0d7 [ 846.432237] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.432269] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.432271] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.432272] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.432273] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.432274] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.432275] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.432277] ---[ end trace abca54df39d14f5e ]--- [ 846.432279] F2FS-fs (loop0): invalid blkaddr: 1024, type: 5, run fsck to fix. 
[ 846.432376] WARNING: CPU: 1 PID: 1249 at fs/f2fs/f2fs.h:2697 f2fs_wait_on_block_writeback+0xb1/0x110 [ 846.432376] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.432410] CPU: 1 PID: 1249 Comm: a.out Tainted: G W 4.18.0-rc3+ #1 [ 846.432411] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.432413] RIP: 0010:f2fs_wait_on_block_writeback+0xb1/0x110 [ 846.432414] Code: 66 90 f0 ff 4b 34 74 59 5b 5d c3 48 8b 7d 00 41 b8 05 00 00 00 89 d9 48 c7 c2 d8 e8 0e 8b 48 c7 c6 1d b0 0a 8b e8 df bc fd ff <0f> 0b f0 80 4d 48 04 e9 67 ff ff ff 48 8b 03 48 c1 e8 37 83 e0 07 [ 846.432445] RSP: 0018:ffff961c414a7910 EFLAGS: 00010286 [ 846.432447] RAX: 0000000000000000 RBX: 0000000000000400 RCX: 0000000000000006 [ 846.432448] RDX: 0000000000000000 RSI: 0000000000000092 RDI: ffff89dfffd165d0 [ 846.432449] RBP: ffff89dff5492800 R08: 0000000000000000 R09: 00000000000002d1 [ 846.432450] R10: ffff961c414a7820 R11: ffff89dfad50cf80 R12: 0000000000000400 [ 846.432451] R13: 0000000000000000 R14: ffff89dff4ff88d0 R15: 0000000000000000 [ 846.432453] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.432454] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.432455] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.432459] Call Trace: [ 846.432463] f2fs_grab_read_bio+0xbc/0xe0 [ 846.432464] 
f2fs_submit_page_read+0x21/0x280 [ 846.432466] f2fs_get_read_data_page+0xb7/0x3c0 [ 846.432468] f2fs_get_lock_data_page+0x29/0x1e0 [ 846.432470] f2fs_get_new_data_page+0x148/0x550 [ 846.432473] f2fs_add_regular_entry+0x1d2/0x550 [ 846.432475] ? __switch_to+0x12f/0x460 [ 846.432477] f2fs_add_dentry+0x6a/0xd0 [ 846.432480] f2fs_do_add_link+0xe9/0x140 [ 846.432483] __recover_dot_dentries+0x260/0x280 [ 846.432485] f2fs_lookup+0x343/0x390 [ 846.432488] __lookup_slow+0x97/0x150 [ 846.432490] lookup_slow+0x35/0x50 [ 846.432505] walk_component+0x1c6/0x470 [ 846.432509] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.432511] ? page_add_file_rmap+0x13/0x200 [ 846.432513] path_lookupat+0x76/0x230 [ 846.432515] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.432517] filename_lookup+0xb8/0x1a0 [ 846.432520] ? _cond_resched+0x16/0x40 [ 846.432522] ? kmem_cache_alloc+0x160/0x1d0 [ 846.432525] ? path_listxattr+0x41/0xa0 [ 846.432526] path_listxattr+0x41/0xa0 [ 846.432529] do_syscall_64+0x55/0x100 [ 846.432531] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.432533] RIP: 0033:0x7f882de1c0d7 [ 846.432533] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.432565] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.432567] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.432568] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.432569] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.432570] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.432571] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.432573] ---[ end trace abca54df39d14f5f ]--- [ 846.434280] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 846.434424] PGD 80000001ebd3a067 P4D 80000001ebd3a067 PUD 1eb1ae067 PMD 0 [ 
846.434551] Oops: 0000 [#1] SMP PTI [ 846.434697] CPU: 0 PID: 44 Comm: kworker/u5:0 Tainted: G W 4.18.0-rc3+ #1 [ 846.434805] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.435000] Workqueue: fscrypt_read_queue decrypt_work [ 846.435174] RIP: 0010:fscrypt_do_page_crypto+0x6e/0x2d0 [ 846.435351] Code: 00 65 48 8b 04 25 28 00 00 00 48 89 84 24 88 00 00 00 31 c0 e8 43 c2 e0 ff 49 8b 86 48 02 00 00 85 ed c7 44 24 70 00 00 00 00 <48> 8b 58 08 0f 84 14 02 00 00 48 8b 78 10 48 8b 0c 24 48 c7 84 24 [ 846.435696] RSP: 0018:ffff961c40f9bd60 EFLAGS: 00010206 [ 846.435870] RAX: 0000000000000000 RBX: ffffc5f787719b80 RCX: ffffc5f787719b80 [ 846.436051] RDX: ffffffff8b9f4b88 RSI: ffffffff8b0ae622 RDI: ffff961c40f9bdb8 [ 846.436261] RBP: 0000000000001000 R08: ffffc5f787719b80 R09: 0000000000001000 [ 846.436433] R10: 0000000000000018 R11: fefefefefefefeff R12: ffffc5f787719b80 [ 846.436562] R13: ffffc5f787719b80 R14: ffff89dff4ff88d0 R15: 0ffff89dfaddee60 [ 846.436658] FS: 0000000000000000(0000) GS:ffff89dfffc00000(0000) knlGS:0000000000000000 [ 846.436758] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.436898] CR2: 0000000000000008 CR3: 00000001eddd0000 CR4: 00000000000006f0 [ 846.437001] Call Trace: [ 846.437181] ? check_preempt_wakeup+0xf2/0x230 [ 846.437276] ? check_preempt_curr+0x7c/0x90 [ 846.437370] fscrypt_decrypt_page+0x48/0x4d [ 846.437466] __fscrypt_decrypt_bio+0x5b/0x90 [ 846.437542] decrypt_work+0x12/0x20 [ 846.437651] process_one_work+0x15e/0x3d0 [ 846.437740] worker_thread+0x4c/0x440 [ 846.437848] kthread+0xf8/0x130 [ 846.437938] ? rescuer_thread+0x350/0x350 [ 846.438022] ? 
kthread_associate_blkcg+0x90/0x90 [ 846.438117] ret_from_fork+0x35/0x40 [ 846.438201] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.438653] CR2: 0000000000000008 [ 846.438713] ---[ end trace abca54df39d14f60 ]--- [ 846.438796] RIP: 0010:fscrypt_do_page_crypto+0x6e/0x2d0 [ 846.438844] Code: 00 65 48 8b 04 25 28 00 00 00 48 89 84 24 88 00 00 00 31 c0 e8 43 c2 e0 ff 49 8b 86 48 02 00 00 85 ed c7 44 24 70 00 00 00 00 <48> 8b 58 08 0f 84 14 02 00 00 48 8b 78 10 48 8b 0c 24 48 c7 84 24 [ 846.439084] RSP: 0018:ffff961c40f9bd60 EFLAGS: 00010206 [ 846.439176] RAX: 0000000000000000 RBX: ffffc5f787719b80 RCX: ffffc5f787719b80 [ 846.440927] RDX: ffffffff8b9f4b88 RSI: ffffffff8b0ae622 RDI: ffff961c40f9bdb8 [ 846.442083] RBP: 0000000000001000 R08: ffffc5f787719b80 R09: 0000000000001000 [ 846.443284] R10: 0000000000000018 R11: fefefefefefefeff R12: ffffc5f787719b80 [ 846.444448] R13: ffffc5f787719b80 R14: ffff89dff4ff88d0 R15: 0ffff89dfaddee60 [ 846.445558] FS: 0000000000000000(0000) GS:ffff89dfffc00000(0000) knlGS:0000000000000000 [ 846.446687] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.447796] CR2: 0000000000000008 CR3: 00000001eddd0000 CR4: 00000000000006f0 - Location https://elixir.bootlin.com/linux/v4.18-rc4/source/fs/crypto/crypto.c#L149 struct crypto_skcipher *tfm = ci->ci_ctfm; Here ci can be NULL. Note that this issue may require CONFIG_F2FS_FS_ENCRYPTION=y to reproduce. 
Reported-by Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/inode.c | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5088609f0b8a..dfb88956ca9e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -544,6 +544,9 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + return ERR_PTR(-EFAULT); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index abb9f4877c53..b050ec5075fe 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -68,14 +68,16 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } -static bool __written_first_block(struct f2fs_sb_info *sbi, +static int __written_first_block(struct f2fs_sb_info *sbi, struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (is_valid_data_blkaddr(sbi, addr)) - return true; - return false; + if (!__is_valid_data_blkaddr(addr)) + return 1; + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC)) + return -EFAULT; + return 0; } static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -295,6 +297,7 @@ static int do_read_inode(struct inode *inode) struct page *node_page; struct f2fs_inode *ri; projid_t i_projid; + int err; /* Check if ino is within scope */ if (f2fs_check_nid_range(sbi, inode->i_ino)) @@ -368,7 +371,12 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); - if (__written_first_block(sbi, ri)) + err = __written_first_block(sbi, ri); + if (err < 0) { + f2fs_put_page(node_page, 1); + return err; + } + if (!err) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) From 
e356f6e4e509586df5cb1e01b0290b49f22abee6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:02:22 +0800 Subject: [PATCH 0961/1212] f2fs: avoid race between zero_range and background GC Thread A Background GC - f2fs_zero_range - truncate_pagecache_range - gc_data_segment - get_read_data_page - move_data_page - set_page_dirty - set_cold_data - f2fs_do_zero_range - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr Actually, we don't need to set dirty & checked flag on the page, since all valid data in the page should be zeroed by zero_range(). Use i_gc_rwsem[WRITE] to avoid such race condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f476c4857c32..c046834dde1b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1321,8 +1321,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) goto out_sem; - truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1352,12 +1350,19 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, unsigned int end_offset; pgoff_t end; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + truncate_pagecache_range(inode, + (loff_t)index << PAGE_SHIFT, + ((loff_t)pg_end << PAGE_SHIFT) - 1); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1366,7 +1371,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); From 4e76883a94dab869a6ecc77d49b6f39de5ccc9c9 Mon Sep 
17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:04:25 +0800 Subject: [PATCH 0962/1212] f2fs: fix avoid race between truncate and background GC Thread A Background GC - f2fs_setattr isize to 0 - truncate_setsize - gc_data_segment - f2fs_get_read_data_page page #0 - set_page_dirty - set_cold_data - f2fs_truncate - f2fs_setattr isize to 4k - read 4k <--- hit data in cached page #0 Above race condition can cause read out invalid data in a truncated page, fix it by i_gc_rwsem[WRITE] lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ fs/f2fs/file.c | 39 ++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dfb88956ca9e..87086b98f08c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2196,8 +2196,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) if (to > i_size) { down_write(&F2FS_I(inode)->i_mmap_sem); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); } } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c046834dde1b..17174234beb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -800,22 +800,26 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size <= i_size_read(inode)) { - down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); - if (err) - return err; - } else { - /* - * do not trim all blocks after i_size if target size is - * larger than i_size. 
- */ - down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_setsize(inode, attr->ia_size); - up_write(&F2FS_I(inode)->i_mmap_sem); + bool to_smaller = (attr->ia_size <= i_size_read(inode)); + down_write(&F2FS_I(inode)->i_mmap_sem); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + truncate_setsize(inode, attr->ia_size); + + if (to_smaller) + err = f2fs_truncate(inode); + /* + * do not trim all blocks after i_size if target size is + * larger than i_size. + */ + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_mmap_sem); + + if (err) + return err; + + if (!to_smaller) { /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -965,13 +969,18 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); } } From a307972a12dd6c8c0296a62c407b666e880d0738 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:08:59 +0800 Subject: [PATCH 0963/1212] f2fs: refresh recent accessed nat entry in lru list Introduce nat_list_lock to protect nm_i->nat_entries list, and manage it as a LRU list, refresh location for therein recent accessed entries in the list. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c07f67178767..8fbeb2831461 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -845,6 +845,7 @@ struct f2fs_nm_info { struct radix_tree_root nat_set_root;/* root of the nat set cache */ struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ + spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ unsigned int nat_blocks; /* # of nat blocks */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5cdd59031674..b7944fe56f1f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -174,14 +174,30 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, if (raw_ne) node_info_from_raw_nat(&ne->ni, raw_ne); + + spin_lock(&nm_i->nat_list_lock); list_add_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + nm_i->nat_cnt++; return ne; } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { - return radix_tree_lookup(&nm_i->nat_root, n); + struct nat_entry *ne; + + ne = radix_tree_lookup(&nm_i->nat_root, n); + + /* for recent accessed nat entry, move it to tail of lru list */ + if (ne && !get_nat_flag(ne, IS_DIRTY)) { + spin_lock(&nm_i->nat_list_lock); + if (!list_empty(&ne->list)) + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + } + + return ne; } static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, @@ -192,7 +208,6 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { - list_del(&e->list); 
radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; __free_nat_entry(e); @@ -243,16 +258,21 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nm_i->dirty_nat_cnt++; set_nat_flag(ne, IS_DIRTY, true); refresh_list: + spin_lock(&nm_i->nat_list_lock); if (new_ne) list_del_init(&ne->list); else list_move_tail(&ne->list, &head->entry_list); + spin_unlock(&nm_i->nat_list_lock); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, struct nat_entry_set *set, struct nat_entry *ne) { + spin_lock(&nm_i->nat_list_lock); list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; nm_i->dirty_nat_cnt--; @@ -469,13 +489,25 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) if (!down_write_trylock(&nm_i->nat_tree_lock)) return 0; - while (nr_shrink && !list_empty(&nm_i->nat_entries)) { + spin_lock(&nm_i->nat_list_lock); + while (nr_shrink) { struct nat_entry *ne; + + if (list_empty(&nm_i->nat_entries)) + break; + ne = list_first_entry(&nm_i->nat_entries, struct nat_entry, list); + list_del(&ne->list); + spin_unlock(&nm_i->nat_list_lock); + __del_from_nat_cache(nm_i, ne); nr_shrink--; + + spin_lock(&nm_i->nat_list_lock); } + spin_unlock(&nm_i->nat_list_lock); + up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -2909,6 +2941,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); + spin_lock_init(&nm_i->nat_list_lock); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->nid_list_lock); @@ -3027,8 +3060,13 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) unsigned idx; nid = nat_get_nid(natvec[found - 1]) + 1; - for (idx = 0; idx < found; idx++) + for (idx = 0; idx < found; idx++) { + spin_lock(&nm_i->nat_list_lock); + list_del(&natvec[idx]->list); + spin_unlock(&nm_i->nat_list_lock); + 
__del_from_nat_cache(nm_i, natvec[idx]); + } } f2fs_bug_on(sbi, nm_i->nat_cnt); From 3fbe7eea3365e6ccc4e190086fcc762705eda103 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:09:00 +0800 Subject: [PATCH 0964/1212] f2fs: fix incorrect range->len in f2fs_trim_fs() generic/260 reported below error: [+] Default length with start set (should succeed) [+] Length beyond the end of fs (should succeed) [+] Length beyond the end of fs with start set (should succeed) +./tests/generic/260: line 94: [: 18446744073709551615: integer expression expected +./tests/generic/260: line 104: [: 18446744073709551615: integer expression expected Test done ... In f2fs_trim_fs(), if there is no discard being trimmed, we need to correct range->len before return. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e067d4768360..6dec5ab7dd52 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2662,8 +2662,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - range->len = F2FS_BLK_TO_BYTES(trimmed); out: + if (!err) + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } From 46998a9ad69906c3e2a73e98e397cd8d6ac9973b Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Sun, 5 Aug 2018 12:45:35 +0800 Subject: [PATCH 0965/1212] f2fs: wake up gc thread immediately when gc_urgent is set Fixes: 5b0e95398e2b ("f2fs: introduce sbi->gc_mode to determine the policy") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 79e47e7d737c..d4c196552888 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -253,6 +253,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (t >= 1) { sbi->gc_mode = GC_URGENT; if (sbi->gc_thread) { + 
sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); From 7f67de2a4cddd895dbd3583a1e43b90a3424b133 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Aug 2018 22:43:50 +0800 Subject: [PATCH 0966/1212] f2fs: split discard command in prior to block layer Some devices have a small max_{hw,}discard_sectors, so that in __blkdev_issue_discard(), one big discard bio can be split into multiple small discard bios, resulting in heavy load in the IO scheduler and the device, which can hang other sync IO for a long time. Now, f2fs is trying to control discard commands more elaborately, in order to reduce conflicts between discard IO and user IO and so enhance application performance. So in this patch, we split the discard bio in f2fs, prior to the block layer, to avoid issuing multiple discard bios in a short time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 23 ++++---- fs/f2fs/segment.c | 145 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 125 insertions(+), 43 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8fbeb2831461..132dbba9e5e1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -246,7 +246,6 @@ enum { #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ -#define DEF_MAX_DISCARD_LEN 512 /* Max. 
2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ @@ -318,9 +317,10 @@ struct discard_entry { (MAX_PLIST_NUM - 1) : (blk_num - 1)) enum { - D_PREP, - D_SUBMIT, - D_DONE, + D_PREP, /* initial */ + D_PARTIAL, /* partially submitted */ + D_SUBMIT, /* all submitted */ + D_DONE, /* finished */ }; struct discard_info { @@ -345,7 +345,10 @@ struct discard_cmd { struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ + unsigned char issuing; /* issuing discard */ int error; /* bio error */ + spinlock_t lock; /* for state/bio_ref updating */ + unsigned short bio_ref; /* bio reference count */ }; enum { @@ -775,22 +778,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, } static inline bool __is_discard_mergeable(struct discard_info *back, - struct discard_info *front) + struct discard_info *front, unsigned int max_len) { return (back->lstart + back->len == front->lstart) && - (back->len + front->len < DEF_MAX_DISCARD_LEN); + (back->len + front->len <= max_len); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, - struct discard_info *back) + struct discard_info *back, unsigned int max_len) { - return __is_discard_mergeable(back, cur); + return __is_discard_mergeable(back, cur, max_len); } static inline bool __is_discard_front_mergeable(struct discard_info *cur, - struct discard_info *front) + struct discard_info *front, unsigned int max_len) { - return __is_discard_mergeable(cur, front); + return __is_discard_mergeable(cur, front, max_len); } static inline bool __is_extent_mergeable(struct extent_info *back, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6dec5ab7dd52..4914b76823c9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -839,9 +839,12 @@ static struct discard_cmd 
*__create_discard_cmd(struct f2fs_sb_info *sbi, dc->len = len; dc->ref = 0; dc->state = D_PREP; + dc->issuing = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); + spin_lock_init(&dc->lock); + dc->bio_ref = 0; atomic_inc(&dcc->discard_cmd_cnt); dcc->undiscard_blks += len; @@ -868,7 +871,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_dec(&dcc->issing_discard); + atomic_sub(dc->issuing, &dcc->issing_discard); list_del(&dc->list); rb_erase(&dc->rb_node, &dcc->root); @@ -883,9 +886,17 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned long flags; trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + spin_lock_irqsave(&dc->lock, flags); + if (dc->bio_ref) { + spin_unlock_irqrestore(&dc->lock, flags); + return; + } + spin_unlock_irqrestore(&dc->lock, flags); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) @@ -901,10 +912,17 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + unsigned long flags; dc->error = bio->bi_error; - dc->state = D_DONE; - complete_all(&dc->wait); + + spin_lock_irqsave(&dc->lock, flags); + dc->bio_ref--; + if (!dc->bio_ref && dc->state == D_SUBMIT) { + dc->state = D_DONE; + complete_all(&dc->wait); + } + spin_unlock_irqrestore(&dc->lock, flags); bio_put(bio); } @@ -1053,17 +1071,25 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } } - +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len); /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, - struct discard_cmd *dc) + 
struct discard_cmd *dc, + unsigned int *issued) { + struct block_device *bdev = dc->bdev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct bio *bio = NULL; int flag = dpolicy->sync ? REQ_SYNC : 0; + block_t lstart, start, len, total_len; + int err = 0; if (dc->state != D_PREP) return; @@ -1071,29 +1097,79 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return; - trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + trace_f2fs_issue_discard(bdev, dc->start, dc->len); + + lstart = dc->lstart; + start = dc->start; + len = dc->len; + total_len = len; + + dc->len = 0; + + while (total_len && *issued < dpolicy->max_requests && !err) { + struct bio *bio = NULL; + unsigned long flags; + bool last = true; + + if (len > max_discard_blocks) { + len = max_discard_blocks; + last = false; + } + + (*issued)++; + if (*issued == dpolicy->max_requests) + last = true; + + dc->len += len; + + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(start), + SECTOR_FROM_BLOCK(len), + GFP_NOFS, 0, &bio); + if (!err && bio) { + /* + * should keep before submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + atomic_inc(&dcc->issing_discard); + dc->issuing++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, start, start + len); - dc->error = __blkdev_issue_discard(dc->bdev, - SECTOR_FROM_BLOCK(dc->start), - SECTOR_FROM_BLOCK(dc->len), - GFP_NOFS, 0, &bio); - if (!dc->error) { - /* should keep before submission to avoid D_DONE right away */ - dc->state = D_SUBMIT; 
- atomic_inc(&dcc->issued_discard); - atomic_inc(&dcc->issing_discard); - if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(flag, bio); - list_move_tail(&dc->list, wait_list); - __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, FS_DISCARD, 1); + } else { + spin_lock_irqsave(&dc->lock, flags); + if (dc->state == D_PARTIAL) + dc->state = D_SUBMIT; + spin_unlock_irqrestore(&dc->lock, flags); + + __remove_discard_cmd(sbi, dc); + err = -EIO; } - } else { - __remove_discard_cmd(sbi, dc); + + lstart += len; + start += len; + total_len -= len; + len = total_len; } + + if (len) + __update_discard_tree_range(sbi, bdev, lstart, start, len); } static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, @@ -1174,10 +1250,11 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); block_t end = lstart + len; - mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, lstart, (struct rb_entry **)&prev_dc, @@ -1217,7 +1294,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, if (prev_dc && prev_dc->state == D_PREP && prev_dc->bdev == bdev && - __is_discard_back_mergeable(&di, &prev_dc->di)) { + __is_discard_back_mergeable(&di, &prev_dc->di, + max_discard_blocks)) { prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); @@ -1228,7 +1306,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, if (next_dc && next_dc->state == D_PREP && next_dc->bdev == bdev && - __is_discard_front_mergeable(&di, &next_dc->di)) { + __is_discard_front_mergeable(&di, &next_dc->di, + max_discard_blocks)) { 
next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; @@ -1251,8 +1330,6 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, node = rb_next(&prev_dc->rb_node); next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } - - mutex_unlock(&dcc->cmd_lock); } static int __queue_discard_cmd(struct f2fs_sb_info *sbi, @@ -1267,7 +1344,9 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, blkstart -= FDEV(devi).start_blk; } + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); return 0; } @@ -1306,9 +1385,9 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, } dcc->next_pos = dc->lstart + dc->len; - __submit_discard_cmd(sbi, dpolicy, dc); + __submit_discard_cmd(sbi, dpolicy, dc, &issued); - if (++issued >= dpolicy->max_requests) + if (issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); @@ -1363,9 +1442,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, break; } - __submit_discard_cmd(sbi, dpolicy, dc); + __submit_discard_cmd(sbi, dpolicy, dc, &issued); - if (++issued >= dpolicy->max_requests) + if (issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); @@ -2572,9 +2651,9 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, goto skip; } - __submit_discard_cmd(sbi, dpolicy, dc); + __submit_discard_cmd(sbi, dpolicy, dc, &issued); - if (++issued >= dpolicy->max_requests) { + if (issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; blk_finish_plug(&plug); From 783a75765aad650efef729b9b26b7ffeea7af5fe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Aug 2018 20:30:18 +0800 Subject: [PATCH 0967/1212] f2fs: support discard submission error injection This patch adds to support discard submission error injection for testing error handling of __submit_discard_cmd(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 8 ++++++++ fs/f2fs/super.c | 1 + 3 files changed, 10 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 132dbba9e5e1..926ba2fd2680 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -58,6 +58,7 @@ enum { FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, + FAULT_DISCARD, FAULT_MAX, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4914b76823c9..17b6fcacca19 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1122,10 +1122,18 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_DISCARD)) { + f2fs_show_injection_info(FAULT_DISCARD); + err = -EIO; + goto submit; + } +#endif err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, 0, &bio); +submit: if (!err && bio) { /* * should keep before submission to avoid D_DONE diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 59e20a6b6bd9..5511fa92b917 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -55,6 +55,7 @@ char *f2fs_fault_name[FAULT_MAX] = { [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) From 06962ff3351ac429a448be1b7e4a8097ee363fed Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Aug 2018 10:14:55 +0800 Subject: [PATCH 0968/1212] f2fs: fix use-after-free of dicard command entry As Dan Carpenter reported: The patch 20ee4382322c: "f2fs: issue small discard by LBA order" from Jul 8, 2018, leads to the following Smatch warning: fs/f2fs/segment.c:1277 __issue_discard_cmd_orderly() warn: 'dc' was already freed. See also: fs/f2fs/segment.c:2550 __issue_discard_cmd_range() warn: 'dc' was already freed. 
In order to fix this issue, let's get error from __submit_discard_cmd(), and release current discard command after we referenced next one. Reported-by: Dan Carpenter Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 82 +++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 17b6fcacca19..6da010f7887d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1075,7 +1075,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static void __submit_discard_cmd(struct f2fs_sb_info *sbi, +static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, struct discard_cmd *dc, unsigned int *issued) @@ -1092,10 +1092,10 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, int err = 0; if (dc->state != D_PREP) - return; + return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - return; + return 0; trace_f2fs_issue_discard(bdev, dc->start, dc->len); @@ -1134,50 +1134,53 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(len), GFP_NOFS, 0, &bio); submit: - if (!err && bio) { - /* - * should keep before submission to avoid D_DONE - * right away - */ - spin_lock_irqsave(&dc->lock, flags); - if (last) - dc->state = D_SUBMIT; - else - dc->state = D_PARTIAL; - dc->bio_ref++; - spin_unlock_irqrestore(&dc->lock, flags); - - atomic_inc(&dcc->issing_discard); - dc->issuing++; - list_move_tail(&dc->list, wait_list); - - /* sanity check on discard range */ - __check_sit_bitmap(sbi, start, start + len); - - bio->bi_private = dc; - bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(flag, bio); - atomic_inc(&dcc->issued_discard); - - f2fs_update_iostat(sbi, FS_DISCARD, 1); - } else { + if (err) { spin_lock_irqsave(&dc->lock, flags); if 
(dc->state == D_PARTIAL) dc->state = D_SUBMIT; spin_unlock_irqrestore(&dc->lock, flags); - __remove_discard_cmd(sbi, dc); - err = -EIO; + break; } + f2fs_bug_on(sbi, !bio); + + /* + * should keep before submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + atomic_inc(&dcc->issing_discard); + dc->issuing++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, start, start + len); + + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(flag, bio); + + atomic_inc(&dcc->issued_discard); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + lstart += len; start += len; total_len -= len; len = total_len; } - if (len) + if (!err && len) __update_discard_tree_range(sbi, bdev, lstart, start, len); + return err; } static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, @@ -1383,6 +1386,7 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, while (dc) { struct rb_node *node; + int err = 0; if (dc->state != D_PREP) goto next; @@ -1393,12 +1397,14 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, } dcc->next_pos = dc->lstart + dc->len; - __submit_discard_cmd(sbi, dpolicy, dc, &issued); + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); } @@ -2650,6 +2656,7 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, while (dc && dc->lstart <= end) { struct rb_node *node; + int err = 0; if (dc->len < dpolicy->granularity) goto skip; @@ -2659,11 +2666,14 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, goto skip; } - __submit_discard_cmd(sbi, dpolicy, dc, 
&issued); + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; + if (err) + __remove_discard_cmd(sbi, dc); + blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); @@ -2672,6 +2682,8 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, } skip: node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); if (fatal_signal_pending(current)) From 4d1c79084199cb781f904c70b94501eae22ef3ed Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Aug 2018 17:36:29 +0800 Subject: [PATCH 0969/1212] f2fs: fix to return success when trimming meta area generic/251 --- tests/generic/251.out 2016-05-03 20:20:11.381899000 +0800 QA output created by 251 Running the test: done. +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument ... Ran: generic/251 Failures: generic/251 The reason is that the range covered by fstrim lies in the meta area. Previously we just returned -EINVAL in that case, which made generic/251 fail. To fix this problem, let's relax the restriction and return success with no blocks discarded.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6da010f7887d..259bc4343dd1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2711,8 +2711,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - if (end <= MAIN_BLKADDR(sbi)) - return -EINVAL; + if (end < MAIN_BLKADDR(sbi)) + goto out; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_msg(sbi->sb, KERN_WARNING, From ef6fc75e9a364f096598e59c623788f78427e7e2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Aug 2018 17:36:41 +0800 Subject: [PATCH 0970/1212] f2fs: support fault_type mount option Previously, once fault injection was turned on, all kinds of faults were injected into f2fs by default. To trigger a single type or a specific combination of types during a test, we had to configure a sysfs entry, which is inconvenient to integrate into a test suite such as xfstests. So this patch introduces a new mount option 'fault_type' to complement the existing option 'fault_injection'; with these two mount options, we can specify any fault rate/type at mount time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 20 ++++++++++++++++++ fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 7 ++++-- fs/f2fs/super.c | 34 ++++++++++++++++++++++++------ 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index ecccb51c7279..0c8bdd38cefd 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -155,6 +155,26 @@ noinline_data Disable the inline data feature, inline data feature is enabled by default. data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink.
+fault_injection=%d Enable fault injection in all supported types with + specified injection rate. +fault_type=%d Support configuring fault injection type, should be + enabled with fault_injection option, fault type value + is shown below, it supports single or combined type. + Type_Name Type_Value + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9caec7622d1e..6db267732438 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -28,7 +28,7 @@ struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { - f2fs_build_fault_attr(sbi, 0); + f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); if (!end_io) f2fs_flush_merged_writes(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 926ba2fd2680..08e34719bc6f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -62,6 +62,8 @@ enum { FAULT_MAX, }; +#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) + struct f2fs_fault_info { atomic_t inject_ops; unsigned int inject_rate; @@ -3520,9 +3522,10 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) } #ifdef CONFIG_F2FS_FAULT_INJECTION -extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate); +extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type); #else -#define f2fs_build_fault_attr(sbi, rate) do { } while (0) +#define f2fs_build_fault_attr(sbi, rate, type) do { } 
while (0) #endif #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5511fa92b917..563e9157d765 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -58,17 +58,21 @@ char *f2fs_fault_name[FAULT_MAX] = { [FAULT_DISCARD] = "discard error", }; -void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) +void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); ffi->inject_rate = rate; - ffi->inject_type = (1 << FAULT_MAX) - 1; - } else { - memset(ffi, 0, sizeof(struct f2fs_fault_info)); } + + if (type) + ffi->inject_type = type; + + if (!rate && !type) + memset(ffi, 0, sizeof(struct f2fs_fault_info)); } #endif @@ -113,6 +117,7 @@ enum { Opt_mode, Opt_io_size_bits, Opt_fault_injection, + Opt_fault_type, Opt_lazytime, Opt_nolazytime, Opt_quota, @@ -170,6 +175,7 @@ static match_table_t f2fs_tokens = { {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, + {Opt_fault_type, "fault_type=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, {Opt_quota, "quota"}, @@ -600,7 +606,18 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(sbi, arg); + f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); + set_opt(sbi, FAULT_INJECTION); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_fault_type: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0, arg); set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, @@ -1322,9 +1339,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef 
CONFIG_F2FS_FAULT_INJECTION - if (test_opt(sbi, FAULT_INJECTION)) + if (test_opt(sbi, FAULT_INJECTION)) { seq_printf(seq, ",fault_injection=%u", F2FS_OPTION(sbi).fault_info.inject_rate); + seq_printf(seq, ",fault_type=%u", + F2FS_OPTION(sbi).fault_info.inject_type); + } #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) @@ -1394,7 +1414,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, POSIX_ACL); #endif - f2fs_build_fault_attr(sbi, 0); + f2fs_build_fault_attr(sbi, 0, 0); } #ifdef CONFIG_QUOTA From cd9641d9dda456e8e45163cbb818e601dfb554b3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Aug 2018 23:38:06 +0200 Subject: [PATCH 0971/1212] f2fs: rework fault injection handling to avoid a warning When CONFIG_F2FS_FAULT_INJECTION is disabled, we get a warning about an unused label: fs/f2fs/segment.c: In function '__submit_discard_cmd': fs/f2fs/segment.c:1059:1: error: label 'submit' defined but not used [-Werror=unused-label] This could be fixed by adding another #ifdef around it, but the more reliable way of doing this seems to be to remove the other #ifdefs where that is easily possible. By defining time_to_inject() as a trivial stub, most of the checks for CONFIG_F2FS_FAULT_INJECTION can go away. This also leads to nicer formatting of the code. 
Signed-off-by: Arnd Bergmann Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/data.c | 2 -- fs/f2fs/dir.c | 3 +-- fs/f2fs/f2fs.h | 50 ++++++++++++++++++++++---------------------- fs/f2fs/file.c | 3 +-- fs/f2fs/gc.c | 2 -- fs/f2fs/inode.c | 3 +-- fs/f2fs/node.c | 3 +-- fs/f2fs/recovery.c | 5 ++--- fs/f2fs/segment.c | 4 ---- 10 files changed, 32 insertions(+), 46 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6db267732438..f7cdd3b536e3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -556,13 +556,12 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) spin_lock(&im->ino_lock); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } -#endif + if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87086b98f08c..fe6845820162 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -125,12 +125,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; } -#endif if (f2fs_bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a7feed756592..086639556705 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -517,12 +517,11 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, } start: -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; } -#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 08e34719bc6f..dfa4710695d9 100644 --- a/fs/f2fs/f2fs.h +++ 
b/fs/f2fs/f2fs.h @@ -43,7 +43,6 @@ } while (0) #endif -#ifdef CONFIG_F2FS_FAULT_INJECTION enum { FAULT_KMALLOC, FAULT_KVMALLOC, @@ -62,6 +61,7 @@ enum { FAULT_MAX, }; +#ifdef CONFIG_F2FS_FAULT_INJECTION #define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) struct f2fs_fault_info { @@ -1389,6 +1389,12 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) } return false; } +#else +#define f2fs_show_injection_info(type) do { } while (0) +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + return false; +} #endif /* For write statistics. Suppose sector size is 512 bytes, @@ -1741,13 +1747,12 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (ret) return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); release = *count; goto enospc; } -#endif + /* * let's increase this in prior to actual block count change in order * for f2fs_sync_file to avoid data races when deciding checkpoint. 
@@ -1956,12 +1961,10 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); goto enospc; } -#endif spin_lock(&sbi->stat_lock); @@ -2046,22 +2049,23 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { -#ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page; - if (!for_write) - page = find_get_page_flags(mapping, index, - FGP_LOCK | FGP_ACCESSED); - else - page = find_lock_page(mapping, index); - if (page) - return page; + if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + if (!for_write) + page = find_get_page_flags(mapping, index, + FGP_LOCK | FGP_ACCESSED); + else + page = find_lock_page(mapping, index); + if (page) + return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(FAULT_PAGE_ALLOC); - return NULL; + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); + return NULL; + } } -#endif + if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -2071,12 +2075,11 @@ static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp_mask) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { f2fs_show_injection_info(FAULT_PAGE_GET); return NULL; } -#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } @@ -2141,12 +2144,11 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); return bio; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { f2fs_show_injection_info(FAULT_ALLOC_BIO); return NULL; } -#endif + return 
bio_alloc(GFP_KERNEL, npages); } @@ -2681,12 +2683,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_KMALLOC)) { f2fs_show_injection_info(FAULT_KMALLOC); return NULL; } -#endif + return kmalloc(size, flags); } @@ -2719,12 +2720,11 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_KVMALLOC)) { f2fs_show_injection_info(FAULT_KVMALLOC); return NULL; } -#endif + return kvmalloc(size, flags); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 17174234beb5..39f89a4b7d8f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -671,12 +671,11 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { f2fs_show_injection_info(FAULT_TRUNCATE); return -EIO; } -#endif + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c0a949d83de7..165b42f72943 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -53,12 +53,10 @@ static int gc_thread_func(void *data) continue; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } -#endif if (!sb_start_write_trylock(sbi->sb)) continue; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b050ec5075fe..42856481be98 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -659,12 +659,11 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_EVICT_INODE)) { 
f2fs_show_injection_info(FAULT_EVICT_INODE); err = -EIO; } -#endif + if (!err) { f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b7944fe56f1f..1af0805915b4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2326,12 +2326,11 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ALLOC_NID)) { f2fs_show_injection_info(FAULT_ALLOC_NID); return false; } -#endif + spin_lock(&nm_i->nid_list_lock); if (unlikely(nm_i->available_nids == 0)) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 0a6e81879a1f..501bb0fdda1b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -518,10 +518,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = f2fs_reserve_new_block(&dn); -#ifdef CONFIG_F2FS_FAULT_INJECTION - while (err) + while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) err = f2fs_reserve_new_block(&dn); -#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); if (err) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 259bc4343dd1..83fbdf3e3102 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -470,12 +470,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } -#endif /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) @@ -1122,13 +1120,11 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(FAULT_DISCARD); err = -EIO; goto submit; } -#endif err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), 
SECTOR_FROM_BLOCK(len), From da42f15a55037a378c6c4cb8efed9ae7eb8002e1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 11 Aug 2018 23:42:09 +0800 Subject: [PATCH 0972/1212] f2fs: fix to skip verifying block address for non-regular inode generic/184 1s ... [failed, exit status 1]- output mismatch --- tests/generic/184.out 2015-01-11 16:52:27.643681072 +0800 QA output created by 184 - silence is golden +rm: cannot remove '/mnt/f2fs/null': Bad address +mknod: '/mnt/f2fs/null': Bad address +chmod: cannot access '/mnt/f2fs/null': Bad address +./tests/generic/184: line 36: /mnt/f2fs/null: Bad address ... F2FS-fs (zram0): access invalid blkaddr:259 EIP: f2fs_is_valid_blkaddr+0x14b/0x1b0 [f2fs] f2fs_iget+0x927/0x1010 [f2fs] f2fs_lookup+0x26e/0x630 [f2fs] __lookup_slow+0xb3/0x140 lookup_slow+0x31/0x50 walk_component+0x185/0x1f0 path_lookupat+0x51/0x190 filename_lookup+0x7f/0x140 user_path_at_empty+0x36/0x40 vfs_statx+0x61/0xc0 __do_sys_stat64+0x29/0x40 sys_stat64+0x13/0x20 do_fast_syscall_32+0xaa/0x22c entry_SYSENTER_32+0x53/0x86 In f2fs_iget(), we will check inode's first block address, if it is valid, we will set FI_FIRST_BLOCK_WRITTEN flag in inode. But we should only do this for regular inode, otherwise, like special inode, i_addr[0] is used for storing device info instead of block address, it will fail checking flow obviously. So for non-regular inode, let's skip verifying address and setting flag. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 42856481be98..292f787a65e2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -371,13 +371,15 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); - err = __written_first_block(sbi, ri); - if (err < 0) { - f2fs_put_page(node_page, 1); - return err; + if (S_ISREG(inode->i_mode)) { + err = __written_first_block(sbi, ri); + if (err < 0) { + f2fs_put_page(node_page, 1); + return err; + } + if (!err) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } - if (!err) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; From 1e1ba6365e040b7929828299cf6c950275abd8bc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Aug 2018 17:53:34 -0700 Subject: [PATCH 0973/1212] f2fs: fix performance issue observed with multi-thread sequential read This reverts the commit - "b93f771 - f2fs: remove writepages lock" to fix the drop in sequential read throughput. Test: ./tiotest -t 32 -d /data/tio_tmp -f 32 -b 524288 -k 1 -k 3 -L device: UFS Before - read throughput: 185 MB/s total read requests: 85177 (of these ~80000 are 4KB size requests). total write requests: 2546 (of these ~2208 requests are written in 512KB). After - read throughput: 758 MB/s total read requests: 2417 (of these ~2042 are 512KB reads). total write requests: 2701 (of these ~2034 requests are written in 512KB). 
Signed-off-by: Sahitya Tummala Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ fs/f2fs/data.c | 21 +++++++++++++++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 1 + fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 2 ++ 6 files changed, 35 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index f82da9bbb1fd..3bbb9fe9548c 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -51,6 +51,14 @@ Description: Controls the dirty page count condition for the in-place-update policies. +What: /sys/fs/f2fs//min_seq_blocks +Date: August 2018 +Contact: "Jaegeuk Kim" +Description: + Controls the dirty page count condition for batched sequential + writes in ->writepages. + + What: /sys/fs/f2fs//min_hot_blocks Date: March 2017 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fe6845820162..3911f2aaed89 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2119,6 +2119,18 @@ static int f2fs_write_cache_pages(struct address_space *mapping, return ret; } +static inline bool __should_serialize_io(struct inode *inode, + struct writeback_control *wbc) +{ + if (!S_ISREG(inode->i_mode)) + return false; + if (wbc->sync_mode != WB_SYNC_ALL) + return true; + if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) + return true; + return false; +} + static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) @@ -2127,6 +2139,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct blk_plug plug; int ret; + bool locked = false; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -2157,10 +2170,18 @@ static int __f2fs_write_data_pages(struct address_space *mapping, else if (atomic_read(&sbi->wb_sync_req[DATA])) goto skip_write; + if (__should_serialize_io(inode, wbc)) { 
+ mutex_lock(&sbi->writepages); + locked = true; + } + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + if (locked) + mutex_unlock(&sbi->writepages); + if (wbc->sync_mode == WB_SYNC_ALL) atomic_dec(&sbi->wb_sync_req[DATA]); /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dfa4710695d9..9373300850de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -978,6 +978,7 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_seq_blocks; /* threshold for sequential blocks */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ @@ -1198,6 +1199,7 @@ struct f2fs_sb_info { struct rw_semaphore sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ + struct mutex writepages; /* mutex for writepages() */ #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 83fbdf3e3102..2f604994e5cc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4208,6 +4208,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 563e9157d765..5e329c022e74 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2850,6 +2850,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* init f2fs-specific super block info */ sbi->valid_super_block = 
valid_super_block; mutex_init(&sbi->gc_mutex); + mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); init_rwsem(&sbi->node_change); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d4c196552888..30fd016afeb3 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -397,6 +397,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); @@ -449,6 +450,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_seq_blocks), ATTR_LIST(min_hot_blocks), ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), From 163bbf811962f0a99bc82ac1e6f35e73509758ca Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 25 Jul 2018 12:11:56 +0900 Subject: [PATCH 0974/1212] f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc The f2fs_gc() called by f2fs_balance_fs() must be called outside of fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. If it hits the maximum number of retries in GC, let's give a chance to release gc_mutex for a short time in order not to go into live lock in the worst case. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 121 ++++++++++++++++++++++++---------------------- fs/f2fs/gc.c | 26 +++++++--- fs/f2fs/segment.c | 6 ++- fs/f2fs/segment.h | 2 +- 6 files changed, 92 insertions(+), 68 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3911f2aaed89..1a8f6dd5a485 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2214,14 +2214,14 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) loff_t i_size = i_size_read(inode); if (to > i_size) { - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9373300850de..b0151dd8ed76 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1308,6 +1308,7 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 39f89a4b7d8f..8e381b6385e3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -801,8 +801,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { bool to_smaller = (attr->ia_size <= i_size_read(inode)); - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); @@ -812,8 +812,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. 
*/ - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -969,8 +969,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -979,8 +979,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1195,25 +1195,33 @@ static int __exchange_data_block(struct inode *src_inode, return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; int ret; f2fs_balance_fs(sbi, true); + + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_lock_op(sbi); - f2fs_drop_extent_tree(inode); - + truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) { - pgoff_t pg_start, pg_end; loff_t new_size; int ret; @@ -1228,25 +1236,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return 
ret; - pg_start = offset >> PAGE_SHIFT; - pg_end = (offset + len) >> PAGE_SHIFT; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out_unlock; + return ret; - truncate_pagecache(inode, offset); - - ret = f2fs_do_collapse(inode, pg_start, pg_end); + ret = f2fs_do_collapse(inode, offset, len); if (ret) - goto out_unlock; + return ret; /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); @@ -1254,11 +1254,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, new_size); ret = f2fs_truncate_blocks(inode, new_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out_unlock: - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1324,10 +1322,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - goto out_sem; + return ret; pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1339,7 +1336,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - goto out_sem; + return ret; new_size = max_t(loff_t, new_size, offset + len); } else { @@ -1347,7 +1344,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - goto out_sem; + return ret; 
new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1359,6 +1356,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, pgoff_t end; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1370,6 +1368,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1381,6 +1380,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1409,9 +1409,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, else f2fs_i_size_write(inode, new_size); } -out_sem: - up_write(&F2FS_I(inode)->i_mmap_sem); - return ret; } @@ -1440,26 +1437,27 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) - goto out; + return ret; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out; - - truncate_pagecache(inode, offset); + return ret; pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + truncate_pagecache(inode, 
offset); + while (!ret && idx > pg_start) { nr = idx - pg_start; if (nr > delta) @@ -1473,16 +1471,17 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out: - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1722,8 +1721,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1734,6 +1731,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (!get_dirty_pages(inode)) goto skip_flush; @@ -1741,18 +1740,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; + } skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1770,9 +1771,9 @@ static int 
f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - inode_lock(inode); + f2fs_balance_fs(F2FS_I_SB(inode), true); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + inode_lock(inode); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1798,7 +1799,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); ret = -EINVAL; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2394,15 +2394,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { - inode_unlock(dst); - goto out; - } } ret = -EINVAL; @@ -2447,6 +2442,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out_unlock; f2fs_balance_fs(sbi, true); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + f2fs_lock_op(sbi); ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, pos_out >> F2FS_BLKSIZE_BITS, @@ -2459,13 +2462,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_i_size_write(dst, dst_osize); } f2fs_unlock_op(sbi); -out_unlock: - if (src != dst) { + + if (src != dst) up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); - inode_unlock(dst); - } -out: +out_src: up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 165b42f72943..3e454522679e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -882,6 +882,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); + 
sbi->skipped_gc_rwsem++; continue; } @@ -911,6 +912,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; up_write(&fi->i_gc_rwsem[READ]); continue; } @@ -1048,6 +1050,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(GFP_NOFS), }; unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, @@ -1060,6 +1063,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; + first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { ret = -EINVAL; @@ -1101,7 +1106,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1110,15 +1116,23 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && - skipped_round * 2 >= round) - f2fs_drop_inmem_pages_all(sbi, true); + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { segno = NULL_SEGNO; goto gc_more; } + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } if (gc_type == FG_GC) ret = f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 
2f604994e5cc..fe06152b0a6f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -445,8 +445,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) int err; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + down_write(&fi->i_gc_rwsem[WRITE]); + + f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); mutex_lock(&fi->inmem_lock); @@ -461,6 +463,8 @@ int f2fs_commit_inmem_pages(struct inode *inode) clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); + up_write(&fi->i_gc_rwsem[WRITE]); + return err; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 50495515f0a0..b3d9e317ff0c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,7 +215,7 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) -#define MAX_SKIP_ATOMIC_COUNT 16 +#define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; From e6105542d5e0c8de2a517be25549d6e86c97fbd5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Aug 2018 22:37:25 +0800 Subject: [PATCH 0975/1212] f2fs: readahead encrypted block during GC During GC, for each encrypted block, we will read the block synchronously into a meta page, and then submit it into the current cold data log area. This block read model with 4k granularity performs poorly, so, as with migrating non-encrypted blocks, let's readahead encrypted blocks as well to improve migration performance. To implement this, we choose the meta page whose index is the old block address of the encrypted block, and readahead the ciphertext into this page; later, if the readahead page is still uptodate, we will load its data into the target meta page, and submit the write IO. Note that for OPU, truncation, deletion, we need to invalidate the meta page after we invalidate the old block address, to make sure we won't load invalid data from the target meta page during encrypted block migration. 
for ((i = 0; i < 1000; i++)) do { xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync"; } done for ((i = 0; i < 1000; i+=2)) do { rm /mnt/f2fs/dir/$i; } done ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0); Before: gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc] gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1 gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc] gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc] gc-6549 [001] .... 214682.213902: block_plug: [gc] gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc] gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1 gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc] gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc] gc-6549 [001] .... 214682.226414: block_plug: [gc] gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc] gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1 gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc] gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc] gc-6549 [001] .... 214682.226911: block_plug: [gc] gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc] gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1 After: gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc] gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc] gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc] gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc] gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc] gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc] gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc] gc-5678 [003] .... 
214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc] gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc] gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc] gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc] gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc] gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc] gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc] gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc] gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc] gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc] gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc] gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc] gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1 gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc] gc-5678 [003] .... 
214327.026046: block_plug: [gc] Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 37 +++++++++++----- fs/f2fs/gc.c | 111 +++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/segment.c | 10 ++++- 3 files changed, 135 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1a8f6dd5a485..267d907104a1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -873,6 +873,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; + block_t old_blkaddr; pgoff_t fofs; blkcnt_t count = 1; int err; @@ -894,9 +895,12 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - - f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + old_blkaddr = dn->data_blkaddr; + f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL, false); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); f2fs_set_data_blkaddr(dn); /* update i_size */ @@ -1611,6 +1615,7 @@ static int f2fs_read_data_pages(struct file *file, static int encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; + struct page *mpage; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) @@ -1622,17 +1627,25 @@ static int encrypt_one_page(struct f2fs_io_info *fio) retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, PAGE_SIZE, 0, fio->page->index, gfp_flags); - if (!IS_ERR(fio->encrypted_page)) - return 0; - - /* flush pending IOs and wait for a while in the ENOMEM case */ - if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { - f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - goto retry_encrypt; + if (IS_ERR(fio->encrypted_page)) { + /* flush pending 
IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); } - return PTR_ERR(fio->encrypted_page); + + mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); + if (mpage) { + if (PageUptodate(mpage)) + memcpy(page_address(mpage), + page_address(fio->encrypted_page), PAGE_SIZE); + f2fs_put_page(mpage, 1); + } + return 0; } static inline bool check_inplace_update_policy(struct inode *inode, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3e454522679e..ada8b8056cd0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -599,6 +599,72 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } +static int ra_data_block(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei = {0, 0, 0}; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = false, + .retry = false, + }; + int err; + + page = f2fs_grab_cache_page(mapping, index, true); + if (!page) + return -ENOMEM; + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + goto put_page; + f2fs_put_dnode(&dn); + + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC))) { + err = -EFAULT; + goto put_page; + } +got_it: + /* read page */ + fio.page = page; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), + dn.data_blkaddr, + FGP_LOCK | FGP_CREAT, 
GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto put_page; + } + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_encrypted_page; + f2fs_put_page(fio.encrypted_page, 0); + f2fs_put_page(page, 1); + return 0; +put_encrypted_page: + f2fs_put_page(fio.encrypted_page, 1); +put_page: + f2fs_put_page(page, 1); + return err; +} + /* * Move data block via META_MAPPING while keeping locked data page. * This can be used to move blocks, aka LBAs, directly on disk. @@ -620,7 +686,7 @@ static void move_data_block(struct inode *inode, block_t bidx, struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; - struct page *page; + struct page *page, *mpage; block_t newaddr; int err; bool lfs_mode = test_opt(fio.sbi, LFS); @@ -683,6 +749,23 @@ static void move_data_block(struct inode *inode, block_t bidx, goto recover_block; } + mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + fio.old_blkaddr, FGP_LOCK, GFP_NOFS); + if (mpage) { + bool updated = false; + + if (PageUptodate(mpage)) { + memcpy(page_address(fio.encrypted_page), + page_address(mpage), PAGE_SIZE); + updated = true; + } + f2fs_put_page(mpage, 1); + invalidate_mapping_pages(META_MAPPING(fio.sbi), + fio.old_blkaddr, fio.old_blkaddr); + if (updated) + goto write_page; + } + err = f2fs_submit_page_bio(&fio); if (err) goto put_page_out; @@ -699,6 +782,7 @@ static void move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } +write_page: set_page_dirty(fio.encrypted_page); f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -873,12 +957,6 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(inode) || is_bad_inode(inode)) continue; - /* if inode uses special I/O path, let's go phase 3 */ - if (f2fs_post_read_required(inode)) { - add_gc_inode(gc_list, inode); - continue; - } - if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); @@ -886,10 +964,23 
@@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } - start_bidx = f2fs_start_bidx_of_node(nofs, inode); + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + + if (f2fs_post_read_required(inode)) { + int err = ra_data_block(inode, start_bidx); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (err) { + iput(inode); + continue; + } + add_gc_inode(gc_list, inode); + continue; + } + data_page = f2fs_get_read_data_page(inode, - start_bidx + ofs_in_node, REQ_RAHEAD, - true); + start_bidx, REQ_RAHEAD, true); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fe06152b0a6f..fdc17721e41e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2159,6 +2159,8 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) if (addr == NEW_ADDR) return; + invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -3059,6 +3061,9 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(fio->sbi), + fio->old_blkaddr, fio->old_blkaddr); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3213,8 +3218,11 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); update_sit_entry(sbi, old_blkaddr, -1); + } locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); From 
ad53f98feee72e1805a02b78022919ef5a998b58 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 3 Sep 2018 15:11:11 +0530 Subject: [PATCH 0976/1212] i2c: xiic: Make the start and the byte count write atomic commit ae7304c3ea28a3ba47a7a8312c76c654ef24967e upstream. Disable interrupts while configuring the transfer and enable them back. We have below as the programming sequence 1. start and slave address 2. byte count and stop In some customer platform there was a lot of interrupts between 1 and 2 and after slave address (around 7 clock cycles) if 2 is not executed then the transaction is nacked. To fix this case make the 2 writes atomic. Signed-off-by: Shubhrajyoti Datta Signed-off-by: Michal Simek [wsa: added a newline for better readability] Signed-off-by: Wolfram Sang Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-xiic.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c index 0b20449e48cf..da9acec1a029 100644 --- a/drivers/i2c/busses/i2c-xiic.c +++ b/drivers/i2c/busses/i2c-xiic.c @@ -533,6 +533,7 @@ static void xiic_start_recv(struct xiic_i2c *i2c) { u8 rx_watermark; struct i2c_msg *msg = i2c->rx_msg = i2c->tx_msg; + unsigned long flags; /* Clear and enable Rx full interrupt. */ xiic_irq_clr_en(i2c, XIIC_INTR_RX_FULL_MASK | XIIC_INTR_TX_ERROR_MASK); @@ -548,6 +549,7 @@ static void xiic_start_recv(struct xiic_i2c *i2c) rx_watermark = IIC_RX_FIFO_DEPTH; xiic_setreg8(i2c, XIIC_RFD_REG_OFFSET, rx_watermark - 1); + local_irq_save(flags); if (!(msg->flags & I2C_M_NOSTART)) /* write the address */ xiic_setreg16(i2c, XIIC_DTR_REG_OFFSET, @@ -558,6 +560,8 @@ static void xiic_start_recv(struct xiic_i2c *i2c) xiic_setreg16(i2c, XIIC_DTR_REG_OFFSET, msg->len | ((i2c->nmsgs == 1) ? 
XIIC_TX_DYN_STOP_MASK : 0)); + local_irq_restore(flags); + if (i2c->nmsgs == 1) /* very last, enable bus not busy as well */ xiic_irq_clr_en(i2c, XIIC_INTR_BNB_MASK); From d811b40d010822eda1fd70d734caf8db1b21ec5f Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Mon, 3 Sep 2018 11:24:57 +0300 Subject: [PATCH 0977/1212] i2c: i801: fix DNV's SMBCTRL register offset commit 851a15114895c5bce163a6f2d57e0aa4658a1be4 upstream. DNV's iTCO is slightly different with SMBCTRL sitting at a different offset when compared to all other devices. Let's fix so that we can properly use iTCO watchdog. Fixes: 84d7f2ebd70d ("i2c: i801: Add support for Intel DNV") Cc: # v4.4+ Signed-off-by: Felipe Balbi Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-i801.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 85f39cc3e276..47581c32b1e1 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -128,6 +128,7 @@ #define SBREG_BAR 0x10 #define SBREG_SMBCTRL 0xc6000c +#define SBREG_SMBCTRL_DNV 0xcf000c /* Host status bits for SMBPCISTS */ #define SMBPCISTS_INTS 0x08 @@ -1251,7 +1252,11 @@ static void i801_add_tco(struct i801_priv *priv) spin_unlock(&p2sb_spinlock); res = &tco_res[ICH_RES_MEM_OFF]; - res->start = (resource_size_t)base64_addr + SBREG_SMBCTRL; + if (pci_dev->device == PCI_DEVICE_ID_INTEL_DNV_SMBUS) + res->start = (resource_size_t)base64_addr + SBREG_SMBCTRL_DNV; + else + res->start = (resource_size_t)base64_addr + SBREG_SMBCTRL; + res->end = res->start + 3; res->flags = IORESOURCE_MEM; From c9125a2116c75383ca0b37d9eecefa3c6235bb4e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 30 Aug 2018 15:13:16 +0200 Subject: [PATCH 0978/1212] ALSA: hda - Fix cancel_work_sync() stall from jackpoll work commit 16037643969e095509cd8446a3f8e406a6dc3a2c upstream. 
On AMD/ATI controllers, the HD-audio controller driver allows a bus reset upon the error recovery, and its procedure includes the cancellation of pending jack polling work as found in snd_hda_bus_codec_reset(). This works usually fine, but it becomes a problem when the reset happens from the jack poll work itself; then calling cancel_work_sync() from the work being processed tries to wait the finish endlessly. As a workaround, this patch adds the check of current_work() and applies the cancel_work_sync() only when it's not from the jackpoll_work. This doesn't fix the root cause of the reported error below, but at least, it eases the unexpected stall of the whole system. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200937 Cc: Cc: Lukas Wunner Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_codec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 3324f98c35f6..f6d4a1046e54 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -4019,7 +4019,8 @@ void snd_hda_bus_reset_codecs(struct hda_bus *bus) list_for_each_codec(codec, bus) { /* FIXME: maybe a better way needed for forced reset */ - cancel_delayed_work_sync(&codec->jackpoll_work); + if (current_work() != &codec->jackpoll_work.work) + cancel_delayed_work_sync(&codec->jackpoll_work); #ifdef CONFIG_PM if (hda_codec_is_power_on(codec)) { hda_call_codec_suspend(codec); From 39223f841425da99c2b0174d8bc42ee1bb29a5ad Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 9 Aug 2017 18:28:32 +0530 Subject: [PATCH 0979/1212] cfq: Give a chance for arming slice idle timer in case of group_idle commit b3193bc0dca9bb69c8ba1ec1a318105c76eb4172 upstream. In below scenario blkio cgroup does not work as per their assigned weights :- 1. When the underlying device is nonrotational with a single HW queue with depth of >= CFQ_HW_QUEUE_MIN 2. 
When the use case is forming two blkio cgroups cg1(weight 1000) & cg2(weight 100) and two processes(file1 and file2) doing sync IO in their respective blkio cgroups. For above usecase result of fio (without this patch):- file1: (groupid=0, jobs=1): err= 0: pid=685: Thu Jan 1 19:41:49 1970 write: IOPS=1315, BW=41.1MiB/s (43.1MB/s)(1024MiB/24906msec) <...> file2: (groupid=0, jobs=1): err= 0: pid=686: Thu Jan 1 19:41:49 1970 write: IOPS=1295, BW=40.5MiB/s (42.5MB/s)(1024MiB/25293msec) <...> // both the process BW is equal even though they belong to diff. cgroups with weight of 1000(cg1) and 100(cg2) In above case (for non rotational NCQ devices), as soon as the request from cg1 is completed and even though it is provided with higher set_slice=10, because of CFQ algorithm when the driver tries to fetch the request, CFQ expires this group without providing any idle time nor weight priority and schedules another cfq group (in this case cg2). And thus both cfq groups(cg1 & cg2) keep alternating to get the disk time and hence loses the cgroup weight based scheduling. Below patch gives a chance to cfq algorithm (cfq_arm_slice_timer) to arm the slice timer in case group_idle is enabled. In case if group_idle is also not required (including for nonrotational NCQ drives), we need to explicitly set group_idle = 0 from sysfs for such cases. With this patch result of fio(for above usecase) :- file1: (groupid=0, jobs=1): err= 0: pid=690: Thu Jan 1 00:06:08 1970 write: IOPS=1706, BW=53.3MiB/s (55.9MB/s)(1024MiB/19197msec) <..> file2: (groupid=0, jobs=1): err= 0: pid=691: Thu Jan 1 00:06:08 1970 write: IOPS=1043, BW=32.6MiB/s (34.2MB/s)(1024MiB/31401msec) <..> // In this processes BW is as per their respective cgroups weight. 
Signed-off-by: Ritesh Harjani Signed-off-by: Jens Axboe Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e04a7b8492cf..6e363ba773b4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2905,7 +2905,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * for devices that support queuing, otherwise we still have a problem * with sync vs async workloads. */ - if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) + if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag && + !cfqd->cfq_group_idle) return; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); From 798ef283a8dd73dea2ae8f817abe75255fde772c Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 9 May 2017 09:39:59 +0200 Subject: [PATCH 0980/1212] kthread: Fix use-after-free if kthread fork fails commit 4d6501dce079c1eb6bf0b1d8f528a5e81770109e upstream. If a kthread forks (e.g. usermodehelper since commit 1da5c46fa965) but fails in copy_process() between calling dup_task_struct() and setting p->set_child_tid, then the value of p->set_child_tid will be inherited from the parent and get prematurely freed by free_kthread_struct(). kthread() - worker_thread() - process_one_work() | - call_usermodehelper_exec_work() | - kernel_thread() | - _do_fork() | - copy_process() | - dup_task_struct() | - arch_dup_task_struct() | - tsk->set_child_tid = current->set_child_tid // implied | - ... | - goto bad_fork_* | - ... | - free_task(tsk) | - free_kthread_struct(tsk) | - kfree(tsk->set_child_tid) - ... - schedule() - __schedule() - wq_worker_sleeping() - kthread_data(task)->flags // UAF The problem started showing up with commit 1da5c46fa965 since it reused ->set_child_tid for the kthread worker data. A better long-term solution might be to get rid of the ->set_child_tid abuse. The comment in set_kthread_struct() also looks slightly wrong. 
Debugged-by: Jamie Iles Fixes: 1da5c46fa965 ("kthread: Make struct kthread kmalloc'ed") Signed-off-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: Jamie Iles Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170509073959.17858-1-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 37ec96fe739d..dd2f79ac0771 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1337,6 +1337,18 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + /* + * This _must_ happen before we call free_task(), i.e. before we jump + * to any of the bad_fork_* labels. This is to avoid freeing + * p->set_child_tid which is (ab)used as a kthread's data pointer for + * kernel threads (PF_KTHREAD). + */ + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + ftrace_graph_init_task(p); rt_mutex_init_task(p); @@ -1498,11 +1510,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif From 8ecd71cd152bd3e08dc650c67ac7c600386a6a41 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 29 May 2017 09:22:07 +0200 Subject: [PATCH 0981/1212] kthread: fix boot hang (regression) on MIPS/OpenRISC commit b0f5a8f32e8bbdaae1abb8abe2d3cbafaba57e08 upstream. 
This fixes a regression in commit 4d6501dce079 where I didn't notice that MIPS and OpenRISC were reinitialising p->{set,clear}_child_tid to NULL after our initialisation in copy_process(). We can simply get rid of the arch-specific initialisation here since it is now always done in copy_process() before hitting copy_thread{,_tls}(). Review notes: - As far as I can tell, copy_process() is the only user of copy_thread_tls(), which is the only caller of copy_thread() for architectures that don't implement copy_thread_tls(). - After this patch, there is no arch-specific code touching p->set_child_tid or p->clear_child_tid whatsoever. - It may look like MIPS/OpenRISC wanted to always have these fields be NULL, but that's not true, as copy_process() would unconditionally set them again _after_ calling copy_thread_tls() before commit 4d6501dce079. Fixes: 4d6501dce079c1eb6bf0b1d8f528a5e81770109e ("kthread: Fix use-after-free if kthread fork fails") Reported-by: Guenter Roeck Tested-by: Guenter Roeck # MIPS only Acked-by: Stafford Horne Acked-by: Oleg Nesterov Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: openrisc@lists.librecores.org Cc: Jamie Iles Cc: Thomas Gleixner Signed-off-by: Vegard Nossum Signed-off-by: Linus Torvalds Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/process.c | 1 - arch/openrisc/kernel/process.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 354b99f56c1e..ed6cac4a4df0 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -115,7 +115,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); unsigned long childksp; - p->set_child_tid = p->clear_child_tid = NULL; childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE - 32; diff --git a/arch/openrisc/kernel/process.c 
b/arch/openrisc/kernel/process.c index 7095dfe7666b..962372143fda 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -152,8 +152,6 @@ copy_thread(unsigned long clone_flags, unsigned long usp, top_of_kernel_stack = sp; - p->set_child_tid = p->clear_child_tid = NULL; - /* Locate userspace context on stack... */ sp -= STACK_FRAME_OVERHEAD; /* redzone */ sp -= sizeof(struct pt_regs); From 227e5d4b86e64e9807fe5ae507b0755229c2876a Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Mon, 5 Jun 2017 15:30:16 +0800 Subject: [PATCH 0982/1212] staging: rt5208: Fix a sleep-in-atomic bug in xd_copy_page commit 498c4b4e9c23855d17ecc2a108d949bb68020481 upstream. The driver may sleep under a spin lock, and the function call path is: rtsx_exclusive_enter_ss (acquire the lock by spin_lock) rtsx_enter_ss rtsx_power_off_card xd_cleanup_work xd_delay_write xd_finish_write xd_copy_page wait_timeout schedule_timeout --> may sleep To fix it, "wait_timeout" is replaced with mdelay in xd_copy_page. Signed-off-by: Jia-Ju Bai Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rts5208/xd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rts5208/xd.c b/drivers/staging/rts5208/xd.c index 10fea7bb8f30..3db4a2570b19 100644 --- a/drivers/staging/rts5208/xd.c +++ b/drivers/staging/rts5208/xd.c @@ -1252,7 +1252,7 @@ static int xd_copy_page(struct rtsx_chip *chip, u32 old_blk, u32 new_blk, reg = 0; rtsx_read_register(chip, XD_CTL, &reg); if (reg & (XD_ECC1_ERROR | XD_ECC2_ERROR)) { - wait_timeout(100); + mdelay(100); if (detect_card_cd(chip, XD_CARD) != STATUS_SUCCESS) { From 035c3ea1608e2f7a7cc43a46179f10add479c9df Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 5 Jun 2017 21:52:34 -0700 Subject: [PATCH 0983/1212] staging/rts5208: Fix read overflow in memcpy commit 88a5b39b69ab1828fd4130e2baadd184109cea69 upstream. 
Noticed by FORTIFY_SOURCE, this swaps memcpy() for strncpy() to zero-value fill the end of the buffer instead of over-reading a string from .rodata. Signed-off-by: Daniel Micay [kees: wrote commit log] Signed-off-by: Kees Cook Cc: Greg Kroah-Hartman Cc: Wayne Porter Signed-off-by: Amit Pundir --- drivers/staging/rts5208/rtsx_scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rts5208/rtsx_scsi.c b/drivers/staging/rts5208/rtsx_scsi.c index 12a3893b98fd..ade29c4295b7 100644 --- a/drivers/staging/rts5208/rtsx_scsi.c +++ b/drivers/staging/rts5208/rtsx_scsi.c @@ -536,7 +536,7 @@ static int inquiry(struct scsi_cmnd *srb, struct rtsx_chip *chip) if (sendbytes > 8) { memcpy(buf, inquiry_buf, 8); - memcpy(buf + 8, inquiry_string, sendbytes - 8); + strncpy(buf + 8, inquiry_string, sendbytes - 8); if (pro_formatter_flag) { /* Additional Length */ buf[4] = 0x33; From 29a2875cf481b8db367a0a8aed54d4b30640558e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2016 18:03:32 -0500 Subject: [PATCH 0984/1212] block,blkcg: use __GFP_NOWARN for best-effort allocations in blkcg commit e00f4f4d0ff7e13b9115428a245b49108d625f09 upstream. blkcg allocates some per-cgroup data structures with GFP_NOWAIT and when that fails falls back to operations which aren't specific to the cgroup. Occasional failures are expected under pressure and falling back to non-cgroup operation is the right thing to do. Unfortunately, I forgot to add __GFP_NOWARN to these allocations and these expected failures end up creating a lot of noise. Add __GFP_NOWARN. 
Signed-off-by: Tejun Heo Reported-by: Marc MERLIN Reported-by: Vlastimil Babka Signed-off-by: Jens Axboe Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- block/blk-cgroup.c | 9 +++++---- block/cfq-iosched.c | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 46ba2402c8f9..987361113ecd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -185,7 +185,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, } wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_NOWAIT); + blkcg->css.id, + GFP_NOWAIT | __GFP_NOWARN); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -193,7 +194,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -1022,7 +1023,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1238,7 +1239,7 @@ int blkcg_activate_policy(struct request_queue *q, if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node); if (!pd) swap(pd, pd_prealloc); if (!pd) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6e363ba773b4..4e1f49434bbe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3811,7 +3811,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqq = kmem_cache_alloc_node(cfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, cfqd->queue->node); if (!cfqq) { 
cfqq = &cfqd->oom_cfqq; From 70cc08c44fb55b587c7485a15549e9f9a12c9405 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Thu, 7 Sep 2017 20:00:58 +0530 Subject: [PATCH 0985/1212] locking/rwsem-xadd: Fix missed wakeup due to reordering of load commit 9c29c31830a4eca724e137a9339137204bbb31be upstream. If a spinner is present, there is a chance that the load of rwsem_has_spinner() in rwsem_wake() can be reordered with respect to decrement of rwsem count in __up_write() leading to wakeup being missed: spinning writer up_write caller --------------- ----------------------- [S] osq_unlock() [L] osq spin_lock(wait_lock) sem->count=0xFFFFFFFF00000001 +0xFFFFFFFF00000000 count=sem->count MB sem->count=0xFFFFFFFE00000001 -0xFFFFFFFF00000001 spin_trylock(wait_lock) return rwsem_try_write_lock(count) spin_unlock(wait_lock) schedule() Reordering of atomic_long_sub_return_release() in __up_write() and rwsem_has_spinner() in rwsem_wake() can cause missing of wakeup in up_write() context. In spinning writer, sem->count and local variable count is 0XFFFFFFFE00000001. It would result in rwsem_try_write_lock() failing to acquire rwsem and spinning writer going to sleep in rwsem_down_write_failed(). The smp_rmb() will make sure that the spinner state is consulted after sem->count is updated in up_write context. 
Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: longman@redhat.com Cc: parri.andrea@gmail.com Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..1be33caf157d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -510,6 +510,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + /* + * __rwsem_down_write_failed_common(sem) + * rwsem_optimistic_spin(sem) + * osq_unlock(sem->osq) + * ... + * atomic_long_add_return(&sem->count) + * + * - VS - + * + * __up_write() + * if (atomic_long_sub_return_release(&sem->count) < 0) + * rwsem_wake(sem) + * osq_is_locked(&sem->osq) + * + * And __up_write() must observe !osq_is_locked() when it observes the + * atomic_long_add_return() in order to not miss a wakeup. + * + * This boils down to: + * + * [S.rel] X = 1 [RmW] r0 = (Y += 0) + * MB RMB + * [RmW] Y += 1 [L] r1 = X + * + * exists (r0=1 /\ r1=0) + */ + smp_rmb(); + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize From 97557d161572172d1d6ea317f254e501a4585c41 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 3 Aug 2017 10:11:52 +0200 Subject: [PATCH 0986/1212] selinux: use GFP_NOWAIT in the AVC kmem_caches commit 476accbe2f6ef69caeebe99f52a286e12ac35aee upstream. There is a strange __GFP_NOMEMALLOC usage pattern in SELinux, specifically GFP_ATOMIC | __GFP_NOMEMALLOC which doesn't make much sense. 
GFP_ATOMIC on its own allows to access memory reserves while __GFP_NOMEMALLOC dictates we cannot use memory reserves. Replace this with the much more sane GFP_NOWAIT in the AVC code as we can tolerate memory allocation failures in that code. Signed-off-by: Michal Hocko Acked-by: Mel Gorman Signed-off-by: Paul Moore Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- security/selinux/avc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index e60c79de13e1..52f3c550abcc 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -348,27 +348,26 @@ static struct avc_xperms_decision_node struct avc_xperms_decision_node *xpd_node; struct extended_perms_decision *xpd; - xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT); if (!xpd_node) return NULL; xpd = &xpd_node->xpd; if (which & XPERMS_ALLOWED) { xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + GFP_NOWAIT); if (!xpd->allowed) goto error; } if (which & XPERMS_AUDITALLOW) { xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + GFP_NOWAIT); if (!xpd->auditallow) goto error; } if (which & XPERMS_DONTAUDIT) { xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + GFP_NOWAIT); if (!xpd->dontaudit) goto error; } @@ -396,8 +395,7 @@ static struct avc_xperms_node *avc_xperms_alloc(void) { struct avc_xperms_node *xp_node; - xp_node = kmem_cache_zalloc(avc_xperms_cachep, - GFP_ATOMIC|__GFP_NOMEMALLOC); + xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT); if (!xp_node) return xp_node; INIT_LIST_HEAD(&xp_node->xpd_head); @@ -550,7 +548,7 @@ static struct avc_node *avc_alloc_node(void) { struct avc_node *node; - node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC); + node 
= kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT); if (!node) goto out; From d914882c936d9c3a1fa4c10d5950c5f0a7d32d79 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Fri, 14 Jul 2017 19:17:56 +0530 Subject: [PATCH 0987/1212] locking/osq_lock: Fix osq_lock queue corruption commit 50972fe78f24f1cd0b9d7bbf1f87d2be9e4f412e upstream. Fix ordering of link creation between node->prev and prev->next in osq_lock(). A case in which the status of optimistic spin queue is CPU6->CPU2 in which CPU6 has acquired the lock. tail v ,-. <- ,-. |6| |2| `-' -> `-' At this point if CPU0 comes in to acquire osq_lock, it will update the tail count. CPU2 CPU0 ---------------------------------- tail v ,-. <- ,-. ,-. |6| |2| |0| `-' -> `-' `-' After tail count update if CPU2 starts to unqueue itself from optimistic spin queue, it will find an updated tail count with CPU0 and update CPU2 node->next to NULL in osq_wait_next(). unqueue-A tail v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' `-' unqueue-B ->tail != curr && !node->next If reordering of following stores happen then prev->next where prev being CPU2 would be updated to point to CPU0 node: tail v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' -> `-' osq_wait_next() node->next <- 0 xchg(node->next, NULL) tail v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' `-' unqueue-C At this point if next instruction WRITE_ONCE(next->prev, prev); in CPU2 path is committed before the update of CPU0 node->prev = prev then CPU0 node->prev will point to CPU6 node. tail v----------. v ,-. <- ,-. ,-. |6| |2| |0| `-' `-' `-' `----------^ At this point if CPU0 path's node->prev = prev is committed resulting in change of CPU0 prev back to CPU2 node. CPU2 node->next is NULL currently, tail v ,-. <- ,-. <- ,-. |6| |2| |0| `-' `-' `-' `----------^ so if CPU0 gets into unqueue path of osq_lock it will keep spinning in infinite loop as condition prev->next == node will never be true. Signed-off-by: Prateek Sood [ Added pictures, rewrote comments. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1500040076-27626-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/locking/osq_lock.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..8d7047ecef4e 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -104,6 +104,19 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + WRITE_ONCE(prev->next, node); /* From e3efb7699a873bd47950e4fdcfd1052afd1ab578 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Thu, 2 Aug 2018 11:50:16 +0300 Subject: [PATCH 0988/1212] ARC: [plat-axs*]: Enable SWAP commit c83532fb0fe053d2e43e9387354cb1b52ba26427 upstream. SWAP support on ARC was fixed earlier by commit 6e3761145a9b ("ARC: Fix CONFIG_SWAP") so now we may safely enable it on platforms that have external media like USB and SD-card. 
Note: it was already allowed for HSDK Signed-off-by: Alexey Brodkin Cc: stable@vger.kernel.org # 6e3761145a9b: ARC: Fix CONFIG_SWAP Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/configs/axs101_defconfig | 1 - arch/arc/configs/axs103_defconfig | 1 - arch/arc/configs/axs103_smp_defconfig | 1 - 3 files changed, 3 deletions(-) diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig index f1ac9818b751..dbee1934dfc6 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -1,6 +1,5 @@ CONFIG_CROSS_COMPILE="arc-linux-" CONFIG_DEFAULT_HOSTNAME="ARCLinux" -# CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_CROSS_MEMORY_ATTACH is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig index 323486d6ee83..561eac854cc3 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -1,6 +1,5 @@ CONFIG_CROSS_COMPILE="arc-linux-" CONFIG_DEFAULT_HOSTNAME="ARCLinux" -# CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_CROSS_MEMORY_ATTACH is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig index 66191cd0447e..aa4f261b6508 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -1,6 +1,5 @@ CONFIG_CROSS_COMPILE="arc-linux-" CONFIG_DEFAULT_HOSTNAME="ARCLinux" -# CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_CROSS_MEMORY_ATTACH is not set From fb9dabb6fca85a6af11ce8669c812dfed0660ca0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Aug 2018 11:42:22 +0300 Subject: [PATCH 0989/1212] misc: mic: SCIF Fix scif_get_new_port() error handling [ Upstream commit a39284ae9d2ad09975c8ae33f1bd0f05fbfbf6ee ] There are only 2 callers of scif_get_new_port() and both appear to get the error handling wrong. 
Both treat zero returns as error, but it actually returns negative error codes and >= 0 on success. Fixes: e9089f43c9a7 ("misc: mic: SCIF open close bind and listen APIs") Signed-off-by: Dan Carpenter Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mic/scif/scif_api.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/misc/mic/scif/scif_api.c b/drivers/misc/mic/scif/scif_api.c index ddc9e4b08b5c..56efa9d18a9a 100644 --- a/drivers/misc/mic/scif/scif_api.c +++ b/drivers/misc/mic/scif/scif_api.c @@ -370,11 +370,10 @@ int scif_bind(scif_epd_t epd, u16 pn) goto scif_bind_exit; } } else { - pn = scif_get_new_port(); - if (!pn) { - ret = -ENOSPC; + ret = scif_get_new_port(); + if (ret < 0) goto scif_bind_exit; - } + pn = ret; } ep->state = SCIFEP_BOUND; @@ -648,13 +647,12 @@ int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block) err = -EISCONN; break; case SCIFEP_UNBOUND: - ep->port.port = scif_get_new_port(); - if (!ep->port.port) { - err = -ENOSPC; - } else { - ep->port.node = scif_info.nodeid; - ep->conn_async_state = ASYNC_CONN_IDLE; - } + err = scif_get_new_port(); + if (err < 0) + break; + ep->port.port = err; + ep->port.node = scif_info.nodeid; + ep->conn_async_state = ASYNC_CONN_IDLE; /* Fall through */ case SCIFEP_BOUND: /* From 61537b3398df88c0ca70ea6bccca0e00de5ab172 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 4 Aug 2018 14:20:40 -0700 Subject: [PATCH 0990/1212] ethtool: Remove trailing semicolon for static inline [ Upstream commit d89d41556141a527030a15233135ba622ba3350d ] Android's header sanitization tool chokes on static inline functions having a trailing semicolon, leading to an incorrectly parsed header file. While the tool should obviously be fixed, also fix the header files for the two affected functions: ethtool_get_flow_spec_ring() and ethtool_get_flow_spec_ring_vf(). 
Fixes: 8cf6f497de40 ("ethtool: Add helper routines to pass vf to rx_flow_spec") Reported-by: Blair Prescott Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ethtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index cd1629170103..08f47e0e9f8d 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -819,13 +819,13 @@ struct ethtool_rx_flow_spec { static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) { return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; -}; +} static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) { return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; -}; +} /** * struct ethtool_rxnfc - command to get or set RX flow classification rules From 5824d86b50b8c5f9ecd725f2d74381a23ab1c63b Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sat, 4 Aug 2018 23:40:26 +0300 Subject: [PATCH 0991/1212] Bluetooth: h5: Fix missing dependency on BT_HCIUART_SERDEV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6c3711ec64fd23a9abc8aaf59a9429569a6282df ] This driver was recently updated to use serdev, so add the appropriate dependency. 
Without this one can get compiler warnings like this if CONFIG_SERIAL_DEV_BUS is not enabled: CC [M] drivers/bluetooth/hci_h5.o drivers/bluetooth/hci_h5.c:934:36: warning: ‘h5_serdev_driver’ defined but not used [-Wunused-variable] static struct serdev_device_driver h5_serdev_driver = { ^~~~~~~~~~~~~~~~ Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index ec6af1595062..4685bd10c473 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -125,6 +125,7 @@ config BT_HCIUART_LL config BT_HCIUART_3WIRE bool "Three-wire UART (H5) protocol support" depends on BT_HCIUART + depends on BT_HCIUART_SERDEV help The HCI Three-wire UART Transport Layer makes it possible to user the Bluetooth HCI over a serial port interface. The HCI From bf748acafd58fe15bc74e0646bc95b4c3fdcb4c3 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 2 Aug 2018 14:11:44 +0300 Subject: [PATCH 0992/1212] gpio: tegra: Move driver registration to subsys_init level [ Upstream commit 40b25bce0adbe641a744d1291bc0e51fb7f3c3d8 ] There is a bug in regards to deferred probing within the drivers core that causes GPIO-driver to suspend after its users. The bug appears if GPIO-driver probe is getting deferred, which happens after introducing dependency on PINCTRL-driver for the GPIO-driver by defining "gpio-ranges" property in device-tree. The bug in the drivers core is old (more than 4 years now) and is well known, unfortunately there is no easy fix for it. The good news is that we can workaround the deferred probe issue by changing GPIO / PINCTRL drivers registration order and hence by moving PINCTRL driver registration to the arch_init level and GPIO to the subsys_init. 
Signed-off-by: Dmitry Osipenko Acked-by: Stefan Agner Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-tegra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c index 896bf29776b0..fb2c1df4f588 100644 --- a/drivers/gpio/gpio-tegra.c +++ b/drivers/gpio/gpio-tegra.c @@ -591,4 +591,4 @@ static int __init tegra_gpio_init(void) { return platform_driver_register(&tegra_gpio_driver); } -postcore_initcall(tegra_gpio_init); +subsys_initcall(tegra_gpio_init); From 44dc4734a7810e7ee24fc8942a93ca5b8521a24b Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 2 Aug 2018 12:12:20 -0500 Subject: [PATCH 0993/1212] scsi: target: fix __transport_register_session locking [ Upstream commit 6a64f6e1591322beb8ce16e952a53582caf2a15c ] When __transport_register_session is called from transport_register_session irqs will already have been disabled, so we do not want the unlock irq call to enable them until the higher level has done the final spin_unlock_irqrestore/ spin_unlock_irq. This has __transport_register_session use the save/restore call. Signed-off-by: Mike Christie Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_transport.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 21f888ac550e..7199bac67333 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -306,6 +306,7 @@ void __transport_register_session( { const struct target_core_fabric_ops *tfo = se_tpg->se_tpg_tfo; unsigned char buf[PR_REG_ISID_LEN]; + unsigned long flags; se_sess->se_tpg = se_tpg; se_sess->fabric_sess_ptr = fabric_sess_ptr; @@ -342,7 +343,7 @@ void __transport_register_session( se_sess->sess_bin_isid = get_unaligned_be64(&buf[0]); } - spin_lock_irq(&se_nacl->nacl_sess_lock); + spin_lock_irqsave(&se_nacl->nacl_sess_lock, flags); /* * The se_nacl->nacl_sess pointer will be set to the * last active I_T Nexus for each struct se_node_acl. @@ -351,7 +352,7 @@ void __transport_register_session( list_add_tail(&se_sess->sess_acl_list, &se_nacl->acl_sess_list); - spin_unlock_irq(&se_nacl->nacl_sess_lock); + spin_unlock_irqrestore(&se_nacl->nacl_sess_lock, flags); } list_add_tail(&se_sess->sess_list, &se_tpg->tpg_sess_list); From ccae23ff45ccd3e9c610c46ee8449c2617118daa Mon Sep 17 00:00:00 2001 From: BingJing Chang Date: Wed, 1 Aug 2018 17:08:36 +0800 Subject: [PATCH 0994/1212] md/raid5: fix data corruption of replacements after originals dropped [ Upstream commit d63e2fc804c46e50eee825c5d3a7228e07048b47 ] During raid5 replacement, the stripes can be marked with R5_NeedReplace flag. Data can be read from being-replaced devices and written to replacing spares without reading all other devices. (It's 'replace' mode. s.replacing = 1) If a being-replaced device is dropped, the replacement progress will be interrupted and resumed with pure recovery mode. However, existing stripes before being interrupted cannot read from the dropped device anymore. 
It prints lots of WARN_ON messages. And it results in data corruption because existing stripes write problematic data into its replacement device and update the progress. \# Erase disks (1MB + 2GB) dd if=/dev/zero of=/dev/sda bs=1MB count=2049 dd if=/dev/zero of=/dev/sdb bs=1MB count=2049 dd if=/dev/zero of=/dev/sdc bs=1MB count=2049 dd if=/dev/zero of=/dev/sdd bs=1MB count=2049 mdadm -C /dev/md0 -amd -R -l5 -n3 -x0 /dev/sd[abc] -z 2097152 \# Ensure array stores non-zero data dd if=/root/data_4GB.iso of=/dev/md0 bs=1MB \# Start replacement mdadm /dev/md0 -a /dev/sdd mdadm /dev/md0 --replace /dev/sda Then, Hot-plug out /dev/sda during recovery, and wait for recovery done. echo check > /sys/block/md0/md/sync_action cat /sys/block/md0/md/mismatch_cnt # it will be greater than 0. Soon after you hot-plug out /dev/sda, you will see many WARN_ON messages. The replacement recovery will be interrupted shortly. After the recovery finishes, it will result in data corruption. Actually, it's just an unhandled case of replacement. In commit (md/raid5: fix interaction of 'replace' and 'recovery'.), if a NeedReplace device is not UPTODATE then that is an error, the commit just simply print WARN_ON but also mark these corrupted stripes with R5_WantReplace. (it means it's ready for writes.) To fix this case, we can leverage 'sync and replace' mode mentioned in commit <9a3e1101b827> (md/raid5: detect and handle replacements during recovery.). We can add logics to detect and use 'sync and replace' mode for these stripes. 
Reported-by: Alex Chen Reviewed-by: Alex Wu Reviewed-by: Chung-Chiang Cheng Signed-off-by: BingJing Chang Signed-off-by: Shaohua Li Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d59b861764a1..0841d8f10a58 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4190,6 +4190,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->failed++; if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; + else if (!rdev) { + rdev = rcu_dereference( + conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; + } } } if (test_bit(STRIPE_SYNCING, &sh->state)) { From 7c075f0a0b37e8d868da9cc7dde23f27e341e193 Mon Sep 17 00:00:00 2001 From: Anton Vasilyev Date: Fri, 27 Jul 2018 18:45:36 +0300 Subject: [PATCH 0995/1212] misc: ti-st: Fix memory leak in the error path of probe() [ Upstream commit 81ae962d7f180c0092859440c82996cccb254976 ] Free resources instead of direct return of the error code if kim_probe fails. Found by Linux Driver Verification project (linuxtesting.org). 
Signed-off-by: Anton Vasilyev Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ti-st/st_kim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c index 71b64550b591..a1bca836e506 100644 --- a/drivers/misc/ti-st/st_kim.c +++ b/drivers/misc/ti-st/st_kim.c @@ -757,14 +757,14 @@ static int kim_probe(struct platform_device *pdev) err = gpio_request(kim_gdata->nshutdown, "kim"); if (unlikely(err)) { pr_err(" gpio %d request failed ", kim_gdata->nshutdown); - return err; + goto err_sysfs_group; } /* Configure nShutdown GPIO as output=0 */ err = gpio_direction_output(kim_gdata->nshutdown, 0); if (unlikely(err)) { pr_err(" unable to configure gpio %d", kim_gdata->nshutdown); - return err; + goto err_sysfs_group; } /* get reference of pdev for request_firmware */ From 57533bacbc5606d12322dbb738f5f335fc835c33 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Aug 2018 11:24:47 +0300 Subject: [PATCH 0996/1212] uio: potential double frees if __uio_register_device() fails [ Upstream commit f019f07ecf6a6b8bd6d7853bce70925d90af02d1 ] The uio_unregister_device() function assumes that if "info->uio_dev" is non-NULL that means "info" is fully allocated. Setting info->uio_dev has to be the last thing in the function. In the current code, if request_threaded_irq() fails then we return with info->uio_dev set to non-NULL but info is not fully allocated and it can lead to double frees. 
Fixes: beafc54c4e2f ("UIO: Add the User IO core code") Signed-off-by: Dan Carpenter Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index bcc1fc027311..b9823eb9c195 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -833,8 +833,6 @@ int __uio_register_device(struct module *owner, if (ret) goto err_uio_dev_add_attributes; - info->uio_dev = idev; - if (info->irq && (info->irq != UIO_IRQ_CUSTOM)) { /* * Note that we deliberately don't use devm_request_irq @@ -850,6 +848,7 @@ int __uio_register_device(struct module *owner, goto err_request_irq; } + info->uio_dev = idev; return 0; err_request_irq: From 0a93d88d6638bcf1d85917cf60ba812826b41a01 Mon Sep 17 00:00:00 2001 From: Anton Vasilyev Date: Fri, 27 Jul 2018 16:39:31 +0300 Subject: [PATCH 0997/1212] tty: rocket: Fix possible buffer overwrite on register_PCI [ Upstream commit 0419056ec8fd01ddf5460d2dba0491aad22657dd ] If number of isa and pci boards exceed NUM_BOARDS on the path rp_init()->init_PCI()->register_PCI() then buffer overwrite occurs in register_PCI() on assign rcktpt_io_addr[i]. The patch adds check on upper bound for index of registered board in register_PCI. Found by Linux Driver Verification project (linuxtesting.org). 
Signed-off-by: Anton Vasilyev Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/tty/rocket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 802eac7e561b..2b8f2e0a4224 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -1915,7 +1915,7 @@ static __init int register_PCI(int i, struct pci_dev *dev) ByteIO_t UPCIRingInd = 0; if (!dev || !pci_match_id(rocket_pci_ids, dev) || - pci_enable_device(dev)) + pci_enable_device(dev) || i >= NUM_BOARDS) return 0; rcktpt_io_addr[i] = pci_resource_start(dev, 0); From f5be08ed5e393ddb75e8b7b14b3fa8b6d236b9fa Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 12 Jul 2018 23:09:26 +0800 Subject: [PATCH 0998/1212] f2fs: do not set free of current section [ Upstream commit 3611ce9911267cb93d364bd71ddea6821278d11f ] For the case when sbi->segs_per_sec > 1, take section:segment = 5 for example, if segment 1 is just used and allocate new segment 2, and the blocks of segment 1 is invalidated, at this time, the previous code will use __set_test_and_free to free the free_secmap and free_sections++, this is not correct since it is still a current section, so fix it. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/segment.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ee44d346ea44..bfa1d31f79aa 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -381,6 +381,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; + if (IS_CURSEC(sbi, secno)) + goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { @@ -388,6 +390,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, free_i->free_sections++; } } +skip_free: spin_unlock(&free_i->segmap_lock); } From 5c228c2f0c595925872ec306b689a0dfc3ebbfa4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Sep 2017 13:20:43 +0200 Subject: [PATCH 0999/1212] perf tools: Allow overriding MAX_NR_CPUS at compile time [ Upstream commit 21b8732eb4479b579bda9ee38e62b2c312c2a0e5 ] After update of kernel, the perf tool doesn't run anymore on my 32MB RAM powerpc board, but still runs on a 128MB RAM board: ~# strace perf execve("/usr/sbin/perf", ["perf"], [/* 12 vars */]) = -1 ENOMEM (Cannot allocate memory) --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} --- +++ killed by SIGSEGV +++ Segmentation fault objdump -x shows that .bss section has a huge size of 24Mbytes: 27 .bss 016baca8 101cebb8 101cebb8 001cd988 2**3 With especially the following objects having quite big size: 10205f80 l O .bss 00140000 runtime_cycles_stats 10345f80 l O .bss 00140000 runtime_stalled_cycles_front_stats 10485f80 l O .bss 00140000 runtime_stalled_cycles_back_stats 105c5f80 l O .bss 00140000 runtime_branches_stats 10705f80 l O .bss 00140000 runtime_cacherefs_stats 10845f80 l O .bss 00140000 runtime_l1_dcache_stats 10985f80 l O .bss 00140000 
runtime_l1_icache_stats 10ac5f80 l O .bss 00140000 runtime_ll_cache_stats 10c05f80 l O .bss 00140000 runtime_itlb_cache_stats 10d45f80 l O .bss 00140000 runtime_dtlb_cache_stats 10e85f80 l O .bss 00140000 runtime_cycles_in_tx_stats 10fc5f80 l O .bss 00140000 runtime_transaction_stats 11105f80 l O .bss 00140000 runtime_elision_stats 11245f80 l O .bss 00140000 runtime_topdown_total_slots 11385f80 l O .bss 00140000 runtime_topdown_slots_retired 114c5f80 l O .bss 00140000 runtime_topdown_slots_issued 11605f80 l O .bss 00140000 runtime_topdown_fetch_bubbles 11745f80 l O .bss 00140000 runtime_topdown_recovery_bubbles This is due to commit 4d255766d28b1 ("perf: Bump max number of cpus to 1024"), because many tables are sized with MAX_NR_CPUS This patch gives the opportunity to redefine MAX_NR_CPUS via $ make EXTRA_CFLAGS=-DMAX_NR_CPUS=1 Signed-off-by: Christophe Leroy Cc: Alexander Shishkin Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170922112043.8349468C57@po15668-vm-win7.idsi0.si.c-s.fr Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/perf/perf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 90129accffbe..4341ed267d4e 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -29,7 +29,9 @@ static inline unsigned long long rdclock(void) return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } +#ifndef MAX_NR_CPUS #define MAX_NR_CPUS 1024 +#endif extern const char *input_name; extern bool perf_host, perf_guest; From 2d926fe3c2557d6b0e956fe26565823a48b15479 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 26 Jul 2018 16:04:47 -0400 Subject: [PATCH 1000/1212] NFSv4.0 fix client reference leak in callback [ Upstream commit 32cd3ee511f4e07ca25d71163b50e704808d22f4 ] If there is an error during processing of a callback message, it leads to reference leak on the client structure and eventually an unclean superblock. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfs/callback_xdr.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index e2e857affbf2..0647cb1ede56 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -911,16 +911,21 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); - if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) + if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) { + if (cps.clp) + nfs_put_client(cps.clp); goto out_invalidcred; + } } cps.minorversion = hdr_arg.minorversion; hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; - if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) { + if (cps.clp) + nfs_put_client(cps.clp); return rpc_system_err; - + } while (status == 0 && nops != hdr_arg.nops) { status = process_op(nops, rqstp, &xdr_in, argp, &xdr_out, resp, &cps); From 805841279cb02cab5c15913db5f4438ba7649403 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Mon, 2 Jul 2018 04:21:18 -0400 Subject: [PATCH 1001/1212] macintosh/via-pmu: Add missing mmio accessors [ Upstream commit 576d5290d678a651b9f36050fc1717e0573aca13 ] Add missing in_8() accessors to init_pmu() and pmu_sr_intr(). 
This fixes several sparse warnings: drivers/macintosh/via-pmu.c:536:29: warning: dereference of noderef expression drivers/macintosh/via-pmu.c:537:33: warning: dereference of noderef expression drivers/macintosh/via-pmu.c:1455:17: warning: dereference of noderef expression drivers/macintosh/via-pmu.c:1456:69: warning: dereference of noderef expression Tested-by: Stan Johnson Signed-off-by: Finn Thain Reviewed-by: Geert Uytterhoeven Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/macintosh/via-pmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index f9512bfa6c3c..0a41132ffba7 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -530,8 +530,9 @@ init_pmu(void) int timeout; struct adb_request req; - out_8(&via[B], via[B] | TREQ); /* negate TREQ */ - out_8(&via[DIRB], (via[DIRB] | TREQ) & ~TACK); /* TACK in, TREQ out */ + /* Negate TREQ. Set TACK to input and TREQ to output. */ + out_8(&via[B], in_8(&via[B]) | TREQ); + out_8(&via[DIRB], (in_8(&via[DIRB]) | TREQ) & ~TACK); pmu_request(&req, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask); timeout = 100000; @@ -1453,8 +1454,8 @@ pmu_sr_intr(void) struct adb_request *req; int bite = 0; - if (via[B] & TREQ) { - printk(KERN_ERR "PMU: spurious SR intr (%x)\n", via[B]); + if (in_8(&via[B]) & TREQ) { + printk(KERN_ERR "PMU: spurious SR intr (%x)\n", in_8(&via[B])); out_8(&via[IFR], SR_INT); return NULL; } From 2071bd1ca7f588a1c08b157026577d5366388233 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 26 Jul 2018 15:59:48 +0200 Subject: [PATCH 1002/1212] ath10k: prevent active scans on potential unusable channels [ Upstream commit 3f259111583801013cb605bb4414aa529adccf1c ] The QCA4019 hw1.0 firmware 10.4-3.2.1-00050 and 10.4-3.5.3-00053 (and most likely all other) seem to ignore the WMI_CHAN_FLAG_DFS flag during the scan. 
This results in transmission (probe requests) on channels which are not "available" for transmissions. Since the firmware is closed source and nothing can be done from our side to fix the problem in it, the driver has to work around this problem. The WMI_CHAN_FLAG_PASSIVE seems to be interpreted by the firmware to not scan actively on a channel unless an AP was detected on it. Simple probe requests will then be transmitted by the STA on the channel. ath10k must therefore also use this flag when it queues a radar channel for scanning. This should reduce the chance of an active scan when the channel might be "unusable" for transmissions. Fixes: e8a50f8ba44b ("ath10k: introduce DFS implementation") Signed-off-by: Sven Eckelmann Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/mac.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 916b9b12edd2..4644357d291a 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -2901,6 +2901,13 @@ static int ath10k_update_channel_list(struct ath10k *ar) passive = channel->flags & IEEE80211_CHAN_NO_IR; ch->passive = passive; + /* the firmware is ignoring the "radar" flag of the + * channel and is scanning actively using Probe Requests + * on "Radar detection"/DFS channels which are not + * marked as "available" + */ + ch->passive |= ch->chan_radar; + ch->freq = channel->center_freq; ch->band_center_freq1 = channel->center_freq; ch->min_power = 0; From 41038bf29f25291f0a04d18dc4b9354c41cf26a1 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 27 Jul 2018 18:23:19 -0700 Subject: [PATCH 1003/1212] MIPS: Fix ISA virt/bus conversion for non-zero PHYS_OFFSET [ Upstream commit 0494d7ffdcebc6935410ea0719b24ab626675351 ] isa_virt_to_bus() & isa_bus_to_virt() claim to treat ISA bus addresses as being identical to physical addresses, but 
they fail to do so in the presence of a non-zero PHYS_OFFSET. Correct this by having them use virt_to_phys() & phys_to_virt(), which consolidates the calculations to one place & ensures that ISA bus addresses do indeed match physical addresses. Signed-off-by: Paul Burton Patchwork: https://patchwork.linux-mips.org/patch/20047/ Cc: James Hogan Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: Vladimir Kondratiev Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/io.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index 75fa296836fc..ab1df19b0957 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -141,14 +141,14 @@ static inline void * phys_to_virt(unsigned long address) /* * ISA I/O bus memory addresses are 1:1 with the physical address. */ -static inline unsigned long isa_virt_to_bus(volatile void * address) +static inline unsigned long isa_virt_to_bus(volatile void *address) { - return (unsigned long)address - PAGE_OFFSET; + return virt_to_phys(address); } -static inline void * isa_bus_to_virt(unsigned long address) +static inline void *isa_bus_to_virt(unsigned long address) { - return (void *)(address + PAGE_OFFSET); + return phys_to_virt(address); } #define isa_page_to_bus page_to_phys From 58e91e96fdb2cacd96d89baba3ec9d8e1b937896 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 2 Jul 2018 12:01:53 -0700 Subject: [PATCH 1004/1212] ata: libahci: Correct setting of DEVSLP register [ Upstream commit 2dbb3ec29a6c069035857a2fc4c24e80e5dfe3cc ] We have seen that on some platforms, SATA device never show any DEVSLP residency. This prevent power gating of SATA IP, which prevent system to transition to low power mode in systems with SLP_S0 aka modern standby systems. The PHY logic is off only in DEVSLP not in slumber. 
Reference: https://www.intel.com/content/dam/www/public/us/en/documents/datasheets /332995-skylake-i-o-platform-datasheet-volume-1.pdf Section 28.7.6.1 Here driver is trying to do read-modify-write the devslp register. But not resetting the bits for which this driver will modify values (DITO, MDAT and DETO). So simply reset those bits before updating to new values. Signed-off-by: Srinivas Pandruvada Reviewed-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libahci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 9628fa131757..8116cb2fef2d 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -2113,6 +2113,8 @@ static void ahci_set_aggressive_devslp(struct ata_port *ap, bool sleep) deto = 20; } + /* Make dito, mdat, deto bits to 0s */ + devslp &= ~GENMASK_ULL(24, 2); devslp |= ((dito << PORT_DEVSLP_DITO_OFFSET) | (mdat << PORT_DEVSLP_MDAT_OFFSET) | (deto << PORT_DEVSLP_DETO_OFFSET) | From 3f5885f4ce3f300bf3f3d92a6e2a3b0adf368382 Mon Sep 17 00:00:00 2001 From: Anton Vasilyev Date: Fri, 27 Jul 2018 16:51:57 +0300 Subject: [PATCH 1005/1212] scsi: 3ware: fix return 0 on the error path of probe [ Upstream commit 4dc98c1995482262e70e83ef029135247fafe0f2 ] tw_probe() returns 0 in case of fail of tw_initialize_device_extension(), pci_resource_start() or tw_reset_sequence() and releases resources. twl_probe() returns 0 in case of fail of twl_initialize_device_extension(), pci_iomap() and twl_reset_sequence(). twa_probe() returns 0 in case of fail of tw_initialize_device_extension(), ioremap() and twa_reset_sequence(). The patch adds retval initialization for these cases. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Anton Vasilyev Acked-by: Adam Radford Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/3w-9xxx.c | 6 +++++- drivers/scsi/3w-sas.c | 3 +++ drivers/scsi/3w-xxxx.c | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 5466246c69b4..b78a2f3745f2 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -2045,6 +2045,7 @@ static int twa_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) if (twa_initialize_device_extension(tw_dev)) { TW_PRINTK(tw_dev->host, TW_DRIVER, 0x25, "Failed to initialize device extension"); + retval = -ENOMEM; goto out_free_device_extension; } @@ -2067,6 +2068,7 @@ static int twa_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) tw_dev->base_addr = ioremap(mem_addr, mem_len); if (!tw_dev->base_addr) { TW_PRINTK(tw_dev->host, TW_DRIVER, 0x35, "Failed to ioremap"); + retval = -ENOMEM; goto out_release_mem_region; } @@ -2074,8 +2076,10 @@ static int twa_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) TW_DISABLE_INTERRUPTS(tw_dev); /* Initialize the card */ - if (twa_reset_sequence(tw_dev, 0)) + if (twa_reset_sequence(tw_dev, 0)) { + retval = -ENOMEM; goto out_iounmap; + } /* Set host specific parameters */ if ((pdev->device == PCI_DEVICE_ID_3WARE_9650SE) || diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index f8374850f714..f0a5536a9ff5 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -1600,6 +1600,7 @@ static int twl_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) if (twl_initialize_device_extension(tw_dev)) { TW_PRINTK(tw_dev->host, TW_DRIVER, 0x1a, "Failed to initialize device extension"); + retval = -ENOMEM; goto out_free_device_extension; } @@ -1614,6 +1615,7 @@ static int twl_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) tw_dev->base_addr = pci_iomap(pdev, 1, 0); if (!tw_dev->base_addr) { TW_PRINTK(tw_dev->host, TW_DRIVER, 0x1c, "Failed to ioremap"); + retval = 
-ENOMEM; goto out_release_mem_region; } @@ -1623,6 +1625,7 @@ static int twl_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) /* Initialize the card */ if (twl_reset_sequence(tw_dev, 0)) { TW_PRINTK(tw_dev->host, TW_DRIVER, 0x1d, "Controller reset failed during probe"); + retval = -ENOMEM; goto out_iounmap; } diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 14af38036287..308a4206b636 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -2278,6 +2278,7 @@ static int tw_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) if (tw_initialize_device_extension(tw_dev)) { printk(KERN_WARNING "3w-xxxx: Failed to initialize device extension."); + retval = -ENOMEM; goto out_free_device_extension; } @@ -2292,6 +2293,7 @@ static int tw_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) tw_dev->base_addr = pci_resource_start(pdev, 0); if (!tw_dev->base_addr) { printk(KERN_WARNING "3w-xxxx: Failed to get io address."); + retval = -ENOMEM; goto out_release_mem_region; } From a820e7709f7d6f338b21a41e163366b344ab02ab Mon Sep 17 00:00:00 2001 From: Surabhi Vishnoi Date: Wed, 25 Jul 2018 10:59:41 +0300 Subject: [PATCH 1006/1212] ath10k: disable bundle mgmt tx completion event support [ Upstream commit 673bc519c55843c68c3aecff71a4101e79d28d2b ] The tx completion of multiple mgmt frames can be bundled in a single event and sent by the firmware to host, if this capability is not disabled explicitly by the host. If the host cannot handle the bundled mgmt tx completion, this capability support needs to be disabled in the wmi init cmd, sent to the firmware. Add the host capability indication flag in the wmi ready command, to let firmware know the features supported by the host driver. This field is ignored if it is not supported by firmware. Set the host capability indication flag(i.e. host_capab) to zero, for disabling the support of bundle mgmt tx completion. 
This will indicate the firmware to send completion event for every mgmt tx completion, instead of bundling them together and sending in a single event. Tested HW: WCN3990 Tested FW: WLAN.HL.2.0-01188-QCAHLSWMTPLZ-1 Signed-off-by: Surabhi Vishnoi Signed-off-by: Rakesh Pillai Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/wmi-tlv.c | 5 +++++ drivers/net/wireless/ath/ath10k/wmi-tlv.h | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index 02eea3c3b5d3..c72eb4464de9 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -1424,6 +1424,11 @@ static struct sk_buff *ath10k_wmi_tlv_op_gen_init(struct ath10k *ar) cfg->keep_alive_pattern_size = __cpu_to_le32(0); cfg->max_tdls_concurrent_sleep_sta = __cpu_to_le32(1); cfg->max_tdls_concurrent_buffer_sta = __cpu_to_le32(1); + cfg->wmi_send_separate = __cpu_to_le32(0); + cfg->num_ocb_vdevs = __cpu_to_le32(0); + cfg->num_ocb_channels = __cpu_to_le32(0); + cfg->num_ocb_schedules = __cpu_to_le32(0); + cfg->host_capab = __cpu_to_le32(0); ath10k_wmi_put_host_mem_chunks(ar, chunks); diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h index ad655c44afdb..f5031f3965c5 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h @@ -1209,6 +1209,11 @@ struct wmi_tlv_resource_config { __le32 keep_alive_pattern_size; __le32 max_tdls_concurrent_sleep_sta; __le32 max_tdls_concurrent_buffer_sta; + __le32 wmi_send_separate; + __le32 num_ocb_vdevs; + __le32 num_ocb_channels; + __le32 num_ocb_schedules; + __le32 host_capab; } __packed; struct wmi_tlv_init_cmd { From 362990d8aaea6dedb1d2a28a7caf868cc3c4ec02 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 30 Jul 2018 13:57:41 +0200 Subject: [PATCH 1007/1212] Bluetooth: hidp: Fix 
handling of strncpy for hid->name information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b3cadaa485f0c20add1644a5c877b0765b285c0c ] This fixes two issues with setting hid->name information. CC net/bluetooth/hidp/core.o In function ‘hidp_setup_hid’, inlined from ‘hidp_session_dev_init’ at net/bluetooth/hidp/core.c:815:9, inlined from ‘hidp_session_new’ at net/bluetooth/hidp/core.c:953:8, inlined from ‘hidp_connection_add’ at net/bluetooth/hidp/core.c:1366:8: net/bluetooth/hidp/core.c:778:2: warning: ‘strncpy’ output may be truncated copying 127 bytes from a string of length 127 [-Wstringop-truncation] strncpy(hid->name, req->name, sizeof(req->name) - 1); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CC net/bluetooth/hidp/core.o net/bluetooth/hidp/core.c: In function ‘hidp_setup_hid’: net/bluetooth/hidp/core.c:778:38: warning: argument to ‘sizeof’ in ‘strncpy’ call is the same expression as the source; did you mean to use the size of the destination? 
[-Wsizeof-pointer-memaccess] strncpy(hid->name, req->name, sizeof(req->name)); ^ Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/hidp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 1811f8e7ddf4..552e00b07196 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -774,7 +774,7 @@ static int hidp_setup_hid(struct hidp_session *session, hid->version = req->version; hid->country = req->country; - strncpy(hid->name, req->name, sizeof(req->name) - 1); + strncpy(hid->name, req->name, sizeof(hid->name)); snprintf(hid->phys, sizeof(hid->phys), "%pMR", &l2cap_pi(session->ctrl_sock->sk)->chan->src); From cf8f39692c14db469b90299a33968eb2822bd732 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Jul 2018 17:48:01 +0200 Subject: [PATCH 1008/1212] x86/mm: Remove in_nmi() warning from vmalloc_fault() [ Upstream commit 6863ea0cda8725072522cd78bda332d9a0b73150 ] It is perfectly okay to take page-faults, especially on the vmalloc area while executing an NMI handler. Remove the warning. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: David H. Gutteridge Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1532533683-5988-2-git-send-email-joro@8bytes.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/fault.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e0a34b0d381e..c4dffae5d939 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -273,8 +273,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Synchronize this task's top level page-table * with the 'reference' page table. From 931ed0a5d3d7bc51947aaa379b8eff2d629cae49 Mon Sep 17 00:00:00 2001 From: Anton Vasilyev Date: Mon, 23 Jul 2018 19:53:30 +0300 Subject: [PATCH 1009/1212] gpio: ml-ioh: Fix buffer underwrite on probe error path [ Upstream commit 4bf4eed44bfe288f459496eaf38089502ef91a79 ] If ioh_gpio_probe() fails on devm_irq_alloc_descs() then chip may point to any element of chip_save array, so reverse iteration from pointer chip may become chip_save[-1] and gpiochip_remove() will operate with wrong memory. The patch fix the error path of ioh_gpio_probe() to correctly bypass chip_save array. Found by Linux Driver Verification project (linuxtesting.org). 
Signed-off-by: Anton Vasilyev Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-ml-ioh.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-ml-ioh.c b/drivers/gpio/gpio-ml-ioh.c index 5536108aa9db..fe21734bbe5c 100644 --- a/drivers/gpio/gpio-ml-ioh.c +++ b/drivers/gpio/gpio-ml-ioh.c @@ -495,9 +495,10 @@ static int ioh_gpio_probe(struct pci_dev *pdev, chip = chip_save; err_gpiochip_add: + chip = chip_save; while (--i >= 0) { - chip--; gpiochip_remove(&chip->gpio); + chip++; } kfree(chip_save); From 3d7d5919a51a7f8d0c8c51f7250cd5c4d9f3598e Mon Sep 17 00:00:00 2001 From: Yelena Krivosheev Date: Wed, 18 Jul 2018 18:10:51 +0200 Subject: [PATCH 1010/1212] net: mvneta: fix mtu change on port without link [ Upstream commit 8466baf788ec3e18836bd9c91ba0b1a07af25878 ] It is incorrect to enable TX/RX queues (call by mvneta_port_up()) for port without link. Indeed MTU change for interface without link causes TX queues to stuck. Fixes: c5aff18204da ("net: mvneta: driver for Marvell Armada 370/XP network unit") Signed-off-by: Yelena Krivosheev [gregory.clement: adding Fixes tags and rewording commit log] Signed-off-by: Gregory CLEMENT Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/marvell/mvneta.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index ea693bbf56d8..1c300259d70a 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2569,7 +2569,6 @@ static int mvneta_change_mtu(struct net_device *dev, int mtu) } mvneta_start_dev(pp); - mvneta_port_up(pp); netdev_update_features(dev); From 7ffa0928ce29480c6a9b55cf9e343333107e32cd Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 16 Jun 2018 09:06:33 +0200 Subject: [PATCH 1011/1212] MIPS: Octeon: add missing of_node_put() [ Upstream commit b1259519e618d479ede8a0db5474b3aff99f5056 ] The call to of_find_node_by_name returns a node pointer with refcount incremented thus it must be explicitly decremented here after the last usage. Signed-off-by: Nicholas Mc Guire Signed-off-by: Paul Burton Patchwork: https://patchwork.linux-mips.org/patch/19558/ Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/cavium-octeon/octeon-platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/cavium-octeon/octeon-platform.c b/arch/mips/cavium-octeon/octeon-platform.c index d113c8ded6e2..6df3a4ea77fc 100644 --- a/arch/mips/cavium-octeon/octeon-platform.c +++ b/arch/mips/cavium-octeon/octeon-platform.c @@ -349,6 +349,7 @@ static int __init octeon_ehci_device_init(void) return 0; pd = of_find_device_by_node(ehci_node); + of_node_put(ehci_node); if (!pd) return 0; @@ -411,6 +412,7 @@ static int __init octeon_ohci_device_init(void) return 0; pd = of_find_device_by_node(ohci_node); + of_node_put(ohci_node); if (!pd) return 0; From bde99d93a6aed76ec2039bd9d800346083289d10 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 27 Jul 2018 15:26:55 +0300 Subject: [PATCH 
1012/1212] net: dcb: For wild-card lookups, use priority -1, not 0 [ Upstream commit 08193d1a893c802c4b807e4d522865061f4e9f4f ] The function dcb_app_lookup walks the list of specified DCB APP entries, looking for one that matches a given criteria: ifindex, selector, protocol ID and optionally also priority. The "don't care" value for priority is set to 0, because that priority has not been allowed under CEE regime, which predates the IEEE standardization. Under IEEE, 0 is a valid priority number. But because dcb_app_lookup considers zero a wild card, attempts to add an APP entry with priority 0 fail when other entries exist for a given ifindex / selector / PID triplet. Fix by changing the wild-card value to -1. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/dcb/dcbnl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 4f6c1862dfd2..6fe2b615518c 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1763,7 +1763,7 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && - (!prio || itr->app.priority == prio)) + ((prio == -1) || itr->app.priority == prio)) return itr; } @@ -1798,7 +1798,8 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) u8 prio = 0; spin_lock_bh(&dcb_lock); - if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) + itr = dcb_app_lookup(app, dev->ifindex, -1); + if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); @@ -1826,7 +1827,8 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ - if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) { + itr = dcb_app_lookup(new, dev->ifindex, -1); + if (itr) { if (new->priority) itr->app.priority = new->priority; else { @@ 
-1859,7 +1861,8 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) u8 prio = 0; spin_lock_bh(&dcb_lock); - if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) + itr = dcb_app_lookup(app, dev->ifindex, -1); + if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); From af39fa735a44f950e65ea04513cb5e2d70f35e67 Mon Sep 17 00:00:00 2001 From: Nick Dyer Date: Fri, 27 Jul 2018 11:44:20 -0700 Subject: [PATCH 1013/1212] Input: atmel_mxt_ts - only use first T9 instance [ Upstream commit 36f5d9ef26e52edff046b4b097855db89bf0cd4a ] The driver only registers one input device, which uses the screen parameters from the first T9 instance. The first T63 instance also uses those parameters. It is incorrect to send input reports from the second instances of these objects if they are enabled: the input scaling will be wrong and the positions will be mashed together. This also causes problems on Android if the number of slots exceeds 32. In the future, this could be handled by looking for enabled touch object instances and creating an input device for each one. 
Signed-off-by: Nick Dyer Acked-by: Benson Leung Acked-by: Yufeng Shen Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/atmel_mxt_ts.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 88dfe3008cf4..be2f2521c1c5 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -1593,10 +1593,11 @@ static int mxt_get_object_table(struct mxt_data *data) break; case MXT_TOUCH_MULTI_T9: data->multitouch = MXT_TOUCH_MULTI_T9; + /* Only handle messages from first T9 instance */ data->T9_reportid_min = min_id; - data->T9_reportid_max = max_id; - data->num_touchids = object->num_report_ids - * mxt_obj_instances(object); + data->T9_reportid_max = min_id + + object->num_report_ids - 1; + data->num_touchids = object->num_report_ids; break; case MXT_SPT_MESSAGECOUNT_T44: data->T44_address = object->start_address; From 77d3c98f661ed3bb9d675b9af9763c01d82ebc9c Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 25 Jul 2018 22:46:29 -0300 Subject: [PATCH 1014/1212] partitions/aix: append null character to print data from disk [ Upstream commit d43fdae7bac2def8c4314b5a49822cb7f08a45f1 ] Even if properly initialized, the lvname array (i.e., strings) is read from disk, and might contain corrupt data (e.g., lack the null terminating character for strings). So, make sure the partition name string used in pr_warn() has the null terminating character. Fixes: 6ceea22bbbc8 ("partitions: add aix lvm partition support files") Suggested-by: Daniel J. 
Axtens Signed-off-by: Mauricio Faria de Oliveira Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- block/partitions/aix.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/partitions/aix.c b/block/partitions/aix.c index f3ed7b2d89bf..fa74698e12a6 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -281,10 +281,14 @@ int aix_partition(struct parsed_partitions *state) next_lp_ix += 1; } for (i = 0; i < state->limit; i += 1) - if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) { + char tmp[sizeof(n[i].name) + 1]; // null char + + snprintf(tmp, sizeof(tmp), "%s", n[i].name); pr_warn("partition %s (%u pp's found) is " "not contiguous\n", - n[i].name, lvip[i].pps_found); + tmp, lvip[i].pps_found); + } kfree(pvd); } kfree(n); From a169d7c844a9ed24d110ae4333ed1edf451bf22f Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 25 Jul 2018 22:46:28 -0300 Subject: [PATCH 1015/1212] partitions/aix: fix usage of uninitialized lv_info and lvname structures [ Upstream commit 14cb2c8a6c5dae57ee3e2da10fa3db2b9087e39e ] The if-block that sets a successful return value in aix_partition() uses 'lvip[].pps_per_lv' and 'n[].name' potentially uninitialized. For example, if 'numlvs' is zero or alloc_lvn() fails, neither is initialized, but are used anyway if alloc_pvd() succeeds after it. So, make the alloc_pvd() call conditional on their initialization. This has been hit when attaching an apparently corrupted/stressed AIX LUN, misleading the kernel to pr_warn() invalid data and hang. [...] partition (null) (11 pp's found) is not contiguous [...] partition (null) (2 pp's found) is not contiguous [...] partition (null) (3 pp's found) is not contiguous [...] 
partition (null) (64 pp's found) is not contiguous Fixes: 6ceea22bbbc8 ("partitions: add aix lvm partition support files") Signed-off-by: Mauricio Faria de Oliveira Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- block/partitions/aix.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/partitions/aix.c b/block/partitions/aix.c index fa74698e12a6..8e7d358e0226 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -177,7 +177,7 @@ int aix_partition(struct parsed_partitions *state) u32 vgda_sector = 0; u32 vgda_len = 0; int numlvs = 0; - struct pvd *pvd; + struct pvd *pvd = NULL; struct lv_info { unsigned short pps_per_lv; unsigned short pps_found; @@ -231,10 +231,11 @@ int aix_partition(struct parsed_partitions *state) if (lvip[i].pps_per_lv) foundlvs += 1; } + /* pvd loops depend on n[].name and lvip[].pps_per_lv */ + pvd = alloc_pvd(state, vgda_sector + 17); } put_dev_sector(sect); } - pvd = alloc_pvd(state, vgda_sector + 17); if (pvd) { int numpps = be16_to_cpu(pvd->pp_count); int psn_part1 = be32_to_cpu(pvd->psn_part1); From 22fa0358a99afb20946e6e03cc0cd04ffd8cb304 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 20 Jul 2018 18:16:59 +0200 Subject: [PATCH 1016/1212] iommu/ipmmu-vmsa: Fix allocation in atomic context [ Upstream commit 46583e8c48c5a094ba28060615b3a7c8c576690f ] When attaching a device to an IOMMU group with CONFIG_DEBUG_ATOMIC_SLEEP=y: BUG: sleeping function called from invalid context at mm/slab.h:421 in_atomic(): 1, irqs_disabled(): 128, pid: 61, name: kworker/1:1 ... Call trace: ... arm_lpae_alloc_pgtable+0x114/0x184 arm_64_lpae_alloc_pgtable_s1+0x2c/0x128 arm_32_lpae_alloc_pgtable_s1+0x40/0x6c alloc_io_pgtable_ops+0x60/0x88 ipmmu_attach_device+0x140/0x334 ipmmu_attach_device() takes a spinlock, while arm_lpae_alloc_pgtable() allocates memory using GFP_KERNEL. 
Originally, the ipmmu-vmsa driver had its own custom page table allocation implementation using GFP_ATOMIC, hence the spinlock was fine. Fix this by replacing the spinlock by a mutex, like the arm-smmu driver does. Fixes: f20ed39f53145e45 ("iommu/ipmmu-vmsa: Use the ARM LPAE page table allocator") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/ipmmu-vmsa.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index dfb868e2d129..624e7ff76166 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -44,7 +44,7 @@ struct ipmmu_vmsa_domain { struct io_pgtable_ops *iop; unsigned int context_id; - spinlock_t lock; /* Protects mappings */ + struct mutex mutex; /* Protects mappings */ }; struct ipmmu_vmsa_archdata { @@ -464,7 +464,7 @@ static struct iommu_domain *ipmmu_domain_alloc(unsigned type) if (!domain) return NULL; - spin_lock_init(&domain->lock); + mutex_init(&domain->mutex); return &domain->io_domain; } @@ -488,7 +488,6 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, struct ipmmu_vmsa_archdata *archdata = dev->archdata.iommu; struct ipmmu_vmsa_device *mmu = archdata->mmu; struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); - unsigned long flags; unsigned int i; int ret = 0; @@ -497,7 +496,7 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, return -ENXIO; } - spin_lock_irqsave(&domain->lock, flags); + mutex_lock(&domain->mutex); if (!domain->mmu) { /* The domain hasn't been used yet, initialize it. 
*/ @@ -513,7 +512,7 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, ret = -EINVAL; } - spin_unlock_irqrestore(&domain->lock, flags); + mutex_unlock(&domain->mutex); if (ret < 0) return ret; From ab140bd2316d587161350768ff3c6fb434e33a24 Mon Sep 17 00:00:00 2001 From: Zumeng Chen Date: Wed, 4 Jul 2018 12:35:29 +0800 Subject: [PATCH 1017/1212] mfd: ti_am335x_tscadc: Fix struct clk memory leak [ Upstream commit c2b1509c77a99a0dcea0a9051ca743cb88385f50 ] Use devm_clk_get() to let Linux manage struct clk memory to avoid the following memory leakage report: unreferenced object 0xdd75efc0 (size 64): comm "systemd-udevd", pid 186, jiffies 4294945126 (age 1195.750s) hex dump (first 32 bytes): 61 64 63 5f 74 73 63 5f 66 63 6b 00 00 00 00 00 adc_tsc_fck..... 00 00 00 00 92 03 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x40/0x74 [] __kmalloc_track_caller+0x198/0x388 [] kstrdup+0x40/0x5c [] kstrdup_const+0x30/0x3c [] __clk_create_clk+0x60/0xac [] clk_get_sys+0x74/0x144 [] clk_get+0x5c/0x68 [] ti_tscadc_probe+0x260/0x468 [ti_am335x_tscadc] [] platform_drv_probe+0x60/0xac [] driver_probe_device+0x214/0x2dc [] __driver_attach+0x94/0xc0 [] bus_for_each_dev+0x90/0xa0 [] driver_attach+0x28/0x30 [] bus_add_driver+0x184/0x1ec [] driver_register+0xb0/0xf0 [] __platform_driver_register+0x40/0x54 Signed-off-by: Zumeng Chen Signed-off-by: Lee Jones Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/ti_am335x_tscadc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mfd/ti_am335x_tscadc.c b/drivers/mfd/ti_am335x_tscadc.c index e4e4b22eebc9..4a0f076c91ba 100644 --- a/drivers/mfd/ti_am335x_tscadc.c +++ b/drivers/mfd/ti_am335x_tscadc.c @@ -224,14 +224,13 @@ static int ti_tscadc_probe(struct platform_device *pdev) * The TSC_ADC_SS controller design assumes the OCP clock is * at least 6x faster than the ADC clock.
*/ - clk = clk_get(&pdev->dev, "adc_tsc_fck"); + clk = devm_clk_get(&pdev->dev, "adc_tsc_fck"); if (IS_ERR(clk)) { dev_err(&pdev->dev, "failed to get TSC fck\n"); err = PTR_ERR(clk); goto err_disable_clk; } clock_rate = clk_get_rate(clk); - clk_put(clk); tscadc->clk_div = clock_rate / ADC_CLK; /* TSCADC_CLKDIV needs to be configured to the value minus 1 */ From e498af87075ae63b39bb90cdc67e35b9fe592acb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Jun 2018 11:25:19 +0800 Subject: [PATCH 1018/1212] f2fs: fix to do sanity check with {sit,nat}_ver_bitmap_bytesize [ Upstream commit c77ec61ca0a49544ca81881cc5d5529858f7e196 ] This patch adds to do sanity check with {sit,nat}_ver_bitmap_bytesize during mount, in order to avoid accessing across cache boundary with this abnormal bitmap size. - Overview buffer overrun in build_sit_info() when mounting a crafted f2fs image - Reproduce - Kernel message [ 548.580867] F2FS-fs (loop0): Invalid log blocks per segment (8201) [ 548.580877] F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock [ 548.584979] ================================================================== [ 548.586568] BUG: KASAN: use-after-free in kmemdup+0x36/0x50 [ 548.587715] Read of size 64 at addr ffff8801e9c265ff by task mount/1295 [ 548.589428] CPU: 1 PID: 1295 Comm: mount Not tainted 4.18.0-rc1+ #4 [ 548.589432] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 548.589438] Call Trace: [ 548.589474] dump_stack+0x7b/0xb5 [ 548.589487] print_address_description+0x70/0x290 [ 548.589492] kasan_report+0x291/0x390 [ 548.589496] ? kmemdup+0x36/0x50 [ 548.589509] check_memory_region+0x139/0x190 [ 548.589514] memcpy+0x23/0x50 [ 548.589518] kmemdup+0x36/0x50 [ 548.589545] f2fs_build_segment_manager+0x8fa/0x3410 [ 548.589551] ? __asan_loadN+0xf/0x20 [ 548.589560] ? f2fs_sanity_check_ckpt+0x1be/0x240 [ 548.589566] ? f2fs_flush_sit_entries+0x10c0/0x10c0 [ 548.589587] ? 
__put_user_ns+0x40/0x40 [ 548.589604] ? find_next_bit+0x57/0x90 [ 548.589610] f2fs_fill_super+0x194b/0x2b40 [ 548.589617] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.589637] ? set_blocksize+0x90/0x140 [ 548.589651] mount_bdev+0x1c5/0x210 [ 548.589655] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.589667] f2fs_mount+0x15/0x20 [ 548.589672] mount_fs+0x60/0x1a0 [ 548.589683] ? alloc_vfsmnt+0x309/0x360 [ 548.589688] vfs_kern_mount+0x6b/0x1a0 [ 548.589699] do_mount+0x34a/0x18c0 [ 548.589710] ? lockref_put_or_lock+0xcf/0x160 [ 548.589716] ? copy_mount_string+0x20/0x20 [ 548.589728] ? memcg_kmem_put_cache+0x1b/0xa0 [ 548.589734] ? kasan_check_write+0x14/0x20 [ 548.589740] ? _copy_from_user+0x6a/0x90 [ 548.589744] ? memdup_user+0x42/0x60 [ 548.589750] ksys_mount+0x83/0xd0 [ 548.589755] __x64_sys_mount+0x67/0x80 [ 548.589781] do_syscall_64+0x78/0x170 [ 548.589797] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.589820] RIP: 0033:0x7f76fc331b9a [ 548.589821] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 548.589880] RSP: 002b:00007ffd4f0a0e48 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 548.589890] RAX: ffffffffffffffda RBX: 000000000146c030 RCX: 00007f76fc331b9a [ 548.589892] RDX: 000000000146c210 RSI: 000000000146df30 RDI: 0000000001474ec0 [ 548.589895] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 548.589897] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001474ec0 [ 548.589900] R13: 000000000146c210 R14: 0000000000000000 R15: 0000000000000003 [ 548.590242] The buggy address belongs to the page: [ 548.591243] page:ffffea0007a70980 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 548.592886] flags: 0x2ffff0000000000() [ 548.593665] raw: 02ffff0000000000 dead000000000100 dead000000000200 0000000000000000 [ 548.595258] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 
548.603713] page dumped because: kasan: bad access detected [ 548.605203] Memory state around the buggy address: [ 548.606198] ffff8801e9c26480: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.607676] ffff8801e9c26500: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.609157] >ffff8801e9c26580: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.610629] ^ [ 548.612088] ffff8801e9c26600: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.613674] ffff8801e9c26680: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.615141] ================================================================== [ 548.616613] Disabling lock debugging due to kernel taint [ 548.622871] WARNING: CPU: 1 PID: 1295 at mm/page_alloc.c:4065 __alloc_pages_slowpath+0xe4a/0x1420 [ 548.622878] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 548.623217] CPU: 1 PID: 1295 Comm: mount Tainted: G B 4.18.0-rc1+ #4 [ 548.623219] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 548.623226] RIP: 0010:__alloc_pages_slowpath+0xe4a/0x1420 [ 548.623227] Code: ff ff 01 89 85 c8 fe ff ff e9 91 fc ff ff 41 89 c5 e9 5c fc ff ff 0f 0b 89 f8 25 ff ff f7 ff 89 85 8c fe ff ff e9 d5 f2 ff ff <0f> 0b e9 65 f2 ff ff 65 8b 05 38 81 d2 47 f6 c4 01 74 1c 65 48 8b [ 548.623281] RSP: 0018:ffff8801f28c7678 EFLAGS: 00010246 [ 548.623284] RAX: 0000000000000000 RBX: 00000000006040c0 RCX: ffffffffb82f73b7 [ 548.623287] RDX: 1ffff1003e518eeb RSI: 000000000000000c RDI: 0000000000000000 [ 548.623290] RBP: 
ffff8801f28c7880 R08: 0000000000000000 R09: ffffed0047fff2c5 [ 548.623292] R10: 0000000000000001 R11: ffffed0047fff2c4 R12: ffff8801e88de040 [ 548.623295] R13: 00000000006040c0 R14: 000000000000000c R15: ffff8801f28c7938 [ 548.623299] FS: 00007f76fca51840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 548.623302] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 548.623304] CR2: 00007f19b9171760 CR3: 00000001ed952000 CR4: 00000000000006e0 [ 548.623317] Call Trace: [ 548.623325] ? kasan_check_read+0x11/0x20 [ 548.623330] ? __zone_watermark_ok+0x92/0x240 [ 548.623336] ? get_page_from_freelist+0x1c3/0x1d90 [ 548.623347] ? _raw_spin_lock_irqsave+0x2a/0x60 [ 548.623353] ? warn_alloc+0x250/0x250 [ 548.623358] ? save_stack+0x46/0xd0 [ 548.623361] ? kasan_kmalloc+0xad/0xe0 [ 548.623366] ? __isolate_free_page+0x2a0/0x2a0 [ 548.623370] ? mount_fs+0x60/0x1a0 [ 548.623374] ? vfs_kern_mount+0x6b/0x1a0 [ 548.623378] ? do_mount+0x34a/0x18c0 [ 548.623383] ? ksys_mount+0x83/0xd0 [ 548.623387] ? __x64_sys_mount+0x67/0x80 [ 548.623391] ? do_syscall_64+0x78/0x170 [ 548.623396] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.623401] __alloc_pages_nodemask+0x3c5/0x400 [ 548.623407] ? __alloc_pages_slowpath+0x1420/0x1420 [ 548.623412] ? __mutex_lock_slowpath+0x20/0x20 [ 548.623417] ? kvmalloc_node+0x31/0x80 [ 548.623424] alloc_pages_current+0x75/0x110 [ 548.623436] kmalloc_order+0x24/0x60 [ 548.623442] kmalloc_order_trace+0x24/0xb0 [ 548.623448] __kmalloc_track_caller+0x207/0x220 [ 548.623455] ? f2fs_build_node_manager+0x399/0xbb0 [ 548.623460] kmemdup+0x20/0x50 [ 548.623465] f2fs_build_node_manager+0x399/0xbb0 [ 548.623470] f2fs_fill_super+0x195e/0x2b40 [ 548.623477] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.623481] ? set_blocksize+0x90/0x140 [ 548.623486] mount_bdev+0x1c5/0x210 [ 548.623489] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.623495] f2fs_mount+0x15/0x20 [ 548.623498] mount_fs+0x60/0x1a0 [ 548.623503] ? 
alloc_vfsmnt+0x309/0x360 [ 548.623508] vfs_kern_mount+0x6b/0x1a0 [ 548.623513] do_mount+0x34a/0x18c0 [ 548.623518] ? lockref_put_or_lock+0xcf/0x160 [ 548.623523] ? copy_mount_string+0x20/0x20 [ 548.623528] ? memcg_kmem_put_cache+0x1b/0xa0 [ 548.623533] ? kasan_check_write+0x14/0x20 [ 548.623537] ? _copy_from_user+0x6a/0x90 [ 548.623542] ? memdup_user+0x42/0x60 [ 548.623547] ksys_mount+0x83/0xd0 [ 548.623552] __x64_sys_mount+0x67/0x80 [ 548.623557] do_syscall_64+0x78/0x170 [ 548.623562] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.623566] RIP: 0033:0x7f76fc331b9a [ 548.623567] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 548.623632] RSP: 002b:00007ffd4f0a0e48 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 548.623636] RAX: ffffffffffffffda RBX: 000000000146c030 RCX: 00007f76fc331b9a [ 548.623639] RDX: 000000000146c210 RSI: 000000000146df30 RDI: 0000000001474ec0 [ 548.623641] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 548.623643] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001474ec0 [ 548.623646] R13: 000000000146c210 R14: 0000000000000000 R15: 0000000000000003 [ 548.623650] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 548.623656] F2FS-fs (loop0): Failed to initialize F2FS node manager [ 548.627936] F2FS-fs (loop0): Invalid log blocks per segment (8201) [ 548.627940] F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock [ 548.635835] F2FS-fs (loop0): Failed to initialize F2FS node manager - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.c#L3578 sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); Buffer overrun happens when doing memcpy. I suspect there is missing (inconsistent) checks on bitmap_size. Reported by Wen Xu (wen.xu@gatech.edu) from SSLab, Gatech. 
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/super.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6cc67e1bbb41..2ffc53d0c9c7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1079,12 +1079,17 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int main_segs, blocks_per_seg; + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); - fsmeta += le32_to_cpu(raw_super->segment_count_sit); - fsmeta += le32_to_cpu(raw_super->segment_count_nat); + sit_segs = le32_to_cpu(raw_super->segment_count_sit); + fsmeta += sit_segs; + nat_segs = le32_to_cpu(raw_super->segment_count_nat); + fsmeta += nat_segs; fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); @@ -1105,6 +1110,18 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong bitmap size: sit: %u, nat:%u", + sit_bitmap_size, nat_bitmap_size); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From d04f1b44f64416a4b2a9ba35b6d0610325163349 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 25 Nov 2016 18:46:09 +0000 Subject: [PATCH 1019/1212] MIPS: 
WARN_ON invalid DMA cache maintenance, not BUG_ON [ Upstream commit d4da0e97baea8768b3d66ccef3967bebd50dfc3b ] If a driver causes DMA cache maintenance with a zero length then we currently BUG and kill the kernel. As this is a scenario that we may well be able to recover from, WARN & return in the condition instead. Signed-off-by: Paul Burton Acked-by: Florian Fainelli Patchwork: https://patchwork.linux-mips.org/patch/14623/ Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/mm/c-r4k.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 5d3a25e1cfae..52e8c2026853 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -712,7 +712,8 @@ static void r4k_flush_icache_range(unsigned long start, unsigned long end) static void r4k_dma_cache_wback_inv(unsigned long addr, unsigned long size) { /* Catch bad driver code */ - BUG_ON(size == 0); + if (WARN_ON(size == 0)) + return; preempt_disable(); if (cpu_has_inclusive_pcaches) { @@ -745,7 +746,8 @@ static void r4k_dma_cache_wback_inv(unsigned long addr, unsigned long size) static void r4k_dma_cache_inv(unsigned long addr, unsigned long size) { /* Catch bad driver code */ - BUG_ON(size == 0); + if (WARN_ON(size == 0)) + return; preempt_disable(); if (cpu_has_inclusive_pcaches) { From ff75cdadf1e4eb196eab01ad31588033a0b647fc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 16 Jul 2018 11:50:13 +0300 Subject: [PATCH 1020/1212] RDMA/cma: Do not ignore net namespace for unbound cm_id [ Upstream commit 643d213a9a034fa04f5575a40dfc8548e33ce04f ] Currently if the cm_id is not bound to any netdevice, then for such cm_id, net namespace is ignored; which is incorrect. Regardless of cm_id bound to a netdevice or not, net namespace must match. When a cm_id is bound to a netdevice, in such case net namespace and netdevice both must match.
Fixes: 4c21b5bcef73 ("IB/cma: Add net_dev and private data checks to RDMA CM") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/cma.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d57a78ec7425..0f42411d6a79 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1280,9 +1280,16 @@ static bool cma_match_net_dev(const struct rdma_cm_id *id, (addr->src_addr.ss_family == AF_IB || cma_protocol_roce_dev_port(id->device, port_num)); - return !addr->dev_addr.bound_dev_if || - (net_eq(dev_net(net_dev), addr->dev_addr.net) && - addr->dev_addr.bound_dev_if == net_dev->ifindex); + /* + * Net namespaces must match, and if the listner is listening + * on a specific netdevice than netdevice must match as well. + */ + if (net_eq(dev_net(net_dev), addr->dev_addr.net) && + (!!addr->dev_addr.bound_dev_if == + (addr->dev_addr.bound_dev_if == net_dev->ifindex))) + return true; + else + return false; } static struct rdma_id_private *cma_find_listener( From c72a42d38b1f516b3533400a2d7ac90388b35e6c Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 3 May 2018 17:30:07 +0300 Subject: [PATCH 1021/1212] xhci: Fix use-after-free in xhci_free_virt_device commit 44a182b9d17765514fa2b1cc911e4e65134eef93 upstream. KASAN found a use-after-free in xhci_free_virt_device+0x33b/0x38e where xhci_free_virt_device() sets slot id to 0 if udev exists: if (dev->udev && dev->udev->slot_id) dev->udev->slot_id = 0; dev->udev will be true even if udev is freed because dev->udev is not set to NULL. set dev->udev pointer to NULL in xhci_free_dev() The original patch went to stable so this fix needs to be applied there as well. 
Fixes: a400efe455f7 ("xhci: zero usb device slot_id member when disabling and freeing a xhci slot") Cc: Reported-by: Guenter Roeck Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Mathias Nyman Signed-off-by: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 128a3c0a9286..9dbf1583bd7a 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -3675,6 +3675,9 @@ void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev) } spin_lock_irqsave(&xhci->lock, flags); + + virt_dev->udev = NULL; + /* Don't disable the slot if the host controller is dead. */ state = readl(&xhci->op_regs->status); if (state == 0xffffffff || (xhci->xhc_state & XHCI_STATE_DYING) || From a3b92ee6fc171d7c9d9b6b829b7fef169210440c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 13 Sep 2018 13:18:52 -0700 Subject: [PATCH 1022/1212] vmw_balloon: include asm/io.h Fix a build error due to missing virt_to_phys() Reported-by: kbuild test robot Fixes: f0a1bf29d821b ("vmw_balloon: fix inflation with batching") Cc: stable@vger.kernel.org Cc: Xavier Deguillard Signed-off-by: Nadav Amit Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_balloon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 518e2dec2aa2..5e9122cd3898 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -45,6 +45,7 @@ #include #include #include +#include #include MODULE_AUTHOR("VMware, Inc."); From f9845426f4427eb0031b562dfd5c0ffe99cba644 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 02:50:07 -0700 Subject: [PATCH 1023/1212] netfilter: x_tables: avoid stack-out-of-bounds read in xt_copy_counters_from_user commit e466af75c074e76107ae1cd5a2823e9c61894ffb upstream. 
syzkaller reports an out of bound read in strlcpy(), triggered by xt_copy_counters_from_user() Fix this by using memcpy(), then forcing a zero byte at the last position of the destination, as Florian did for the non COMPAT code. Fixes: d7591f0c41ce ("netfilter: x_tables: introduce and use xt_copy_counters_from_user") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Cc: Greg Hackmann Signed-off-by: Greg Kroah-Hartman --- net/netfilter/x_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 1f3c305df45d..b6e72af15237 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -876,7 +876,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); - strlcpy(info->name, compat_tmp.name, sizeof(info->name)); + memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; user += sizeof(compat_tmp); } else @@ -889,9 +889,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(info, user, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - info->name[sizeof(info->name) - 1] = '\0'; user += sizeof(*info); } + info->name[sizeof(info->name) - 1] = '\0'; size = sizeof(struct xt_counters); size *= info->num_counters; From d02c870872ffb9f0ee605319da46cc52797f837e Mon Sep 17 00:00:00 2001 From: David Rivshin Date: Wed, 27 Apr 2016 21:25:25 -0400 Subject: [PATCH 1024/1212] drivers: net: cpsw: fix parsing of phy-handle DT property in dual_emac config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 552165bcf7060b998b4a9b5b86110b6a5e04dfd9 upstream. 
Commit 9e42f715264ff158478fa30eaed847f6e131366b ("drivers: net: cpsw: add phy-handle parsing") saved the "phy-handle" phandle into a new cpsw_priv field. However, phy connections are per-slave, so the phy_node field should be in cpsw_slave_data rather than cpsw_priv. This would go unnoticed in a single emac configuration. But in dual_emac mode, the last "phy-handle" property parsed for either slave would be used by both of them, causing them both to refer to the same phy_device. Fixes: 9e42f715264f ("drivers: net: cpsw: add phy-handle parsing") Signed-off-by: David Rivshin Tested-by: Nicolas Chauvet Tested-by: Andrew Goodbody Reviewed-by: Mugunthan V N Reviewed-by: Grygorii Strashko Signed-off-by: David S. Miller Signed-off-by: SZ Lin (林上智) Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ti/cpsw.c | 13 ++++++------- drivers/net/ethernet/ti/cpsw.h | 1 + 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index c69b0bdd891d..c21c80a228d9 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -371,7 +371,6 @@ struct cpsw_priv { spinlock_t lock; struct platform_device *pdev; struct net_device *ndev; - struct device_node *phy_node; struct napi_struct napi_rx; struct napi_struct napi_tx; struct device *dev; @@ -1165,8 +1164,8 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, 1 << slave_port, 0, 0, ALE_MCAST_FWD_2); - if (priv->phy_node) - slave->phy = of_phy_connect(priv->ndev, priv->phy_node, + if (slave->data->phy_node) + slave->phy = of_phy_connect(priv->ndev, slave->data->phy_node, &cpsw_adjust_link, 0, slave->data->phy_if); else slave->phy = phy_connect(priv->ndev, slave->data->phy_id, @@ -1957,12 +1956,11 @@ static void cpsw_slave_init(struct cpsw_slave *slave, struct cpsw_priv *priv, slave->port_vlan = data->dual_emac_res_vlan; } -static int cpsw_probe_dt(struct 
cpsw_priv *priv, +static int cpsw_probe_dt(struct cpsw_platform_data *data, struct platform_device *pdev) { struct device_node *node = pdev->dev.of_node; struct device_node *slave_node; - struct cpsw_platform_data *data = &priv->data; int i = 0, ret; u32 prop; @@ -2050,7 +2048,8 @@ static int cpsw_probe_dt(struct cpsw_priv *priv, if (strcmp(slave_node->name, "slave")) continue; - priv->phy_node = of_parse_phandle(slave_node, "phy-handle", 0); + slave_data->phy_node = of_parse_phandle(slave_node, + "phy-handle", 0); parp = of_get_property(slave_node, "phy_id", &lenp); if (of_phy_is_fixed_link(slave_node)) { struct device_node *phy_node; @@ -2291,7 +2290,7 @@ static int cpsw_probe(struct platform_device *pdev) /* Select default pin state */ pinctrl_pm_select_default_state(&pdev->dev); - if (cpsw_probe_dt(priv, pdev)) { + if (cpsw_probe_dt(&priv->data, pdev)) { dev_err(&pdev->dev, "cpsw: platform data missing\n"); ret = -ENODEV; goto clean_runtime_disable_ret; diff --git a/drivers/net/ethernet/ti/cpsw.h b/drivers/net/ethernet/ti/cpsw.h index 442a7038e660..e50afd1b2eda 100644 --- a/drivers/net/ethernet/ti/cpsw.h +++ b/drivers/net/ethernet/ti/cpsw.h @@ -18,6 +18,7 @@ #include struct cpsw_slave_data { + struct device_node *phy_node; char phy_id[MII_BUS_ID_SIZE]; int phy_if; u8 mac_addr[ETH_ALEN]; From b3179842575a1d650b8b5f018252fce947f0e99f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 17 Nov 2016 17:39:59 +0100 Subject: [PATCH 1025/1212] net: ethernet: ti: cpsw: fix mdio device reference leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 86e1d5adcef961eb383ce4eacbe0ef22f06e2045 upstream. Make sure to drop the reference taken by of_find_device_by_node() when looking up an mdio device from a phy_id property during probe. Fixes: 549985ee9c72 ("cpsw: simplify the setup of the register pointers") Signed-off-by: Johan Hovold Signed-off-by: David S. 
Miller Signed-off-by: SZ Lin (林上智) Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ti/cpsw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index c21c80a228d9..c2e110b2549b 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2086,6 +2086,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, } snprintf(slave_data->phy_id, sizeof(slave_data->phy_id), PHY_ID_FMT, mdio->name, phyid); + put_device(&mdio->dev); } else { dev_err(&pdev->dev, "No slave[%d] phy_id or fixed-link property\n", i); goto no_phy_slave; From 21de2732aaab31df375184ebd3d4c53a0247526d Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Mon, 1 Aug 2016 15:02:43 +0800 Subject: [PATCH 1026/1212] ethernet: ti: davinci_emac: add missing of_node_put after calling of_parse_phandle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5817f977527e5cdbea9ca56d2b95824f59c8747d upstream. of_node_put needs to be called when the device node which is got from of_parse_phandle has finished using. Signed-off-by: Peter Chen Signed-off-by: David S. Miller Signed-off-by: SZ Lin (林上智) Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ti/davinci_emac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 6be315303d61..8ecb24186b7f 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -2108,6 +2108,7 @@ static int davinci_emac_remove(struct platform_device *pdev) cpdma_ctlr_destroy(priv->dma); unregister_netdev(ndev); + of_node_put(priv->phy_node); free_netdev(ndev); return 0; From f1b13e97e451975fdd7ff9452afa53092feb37af Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 22 Aug 2018 08:26:31 +0200 Subject: [PATCH 1027/1212] crypto: vmx - Fix sleep-in-atomic bugs commit 0522236d4f9c5ab2e79889cb020d1acbe5da416e upstream. 
This patch fixes sleep-in-atomic bugs in AES-CBC and AES-XTS VMX implementations. The problem is that the blkcipher_* functions should not be called in atomic context. The bugs can be reproduced via the AF_ALG interface by trying to encrypt/decrypt sufficiently large buffers (at least 64 KiB) using the VMX implementations of 'cbc(aes)' or 'xts(aes)'. Such operations then trigger BUG in crypto_yield(): [ 891.863680] BUG: sleeping function called from invalid context at include/crypto/algapi.h:424 [ 891.864622] in_atomic(): 1, irqs_disabled(): 0, pid: 12347, name: kcapi-enc [ 891.864739] 1 lock held by kcapi-enc/12347: [ 891.864811] #0: 00000000f5d42c46 (sk_lock-AF_ALG){+.+.}, at: skcipher_recvmsg+0x50/0x530 [ 891.865076] CPU: 5 PID: 12347 Comm: kcapi-enc Not tainted 4.19.0-0.rc0.git3.1.fc30.ppc64le #1 [ 891.865251] Call Trace: [ 891.865340] [c0000003387578c0] [c000000000d67ea4] dump_stack+0xe8/0x164 (unreliable) [ 891.865511] [c000000338757910] [c000000000172a58] ___might_sleep+0x2f8/0x310 [ 891.865679] [c000000338757990] [c0000000006bff74] blkcipher_walk_done+0x374/0x4a0 [ 891.865825] [c0000003387579e0] [d000000007e73e70] p8_aes_cbc_encrypt+0x1c8/0x260 [vmx_crypto] [ 891.865993] [c000000338757ad0] [c0000000006c0ee0] skcipher_encrypt_blkcipher+0x60/0x80 [ 891.866128] [c000000338757b10] [c0000000006ec504] skcipher_recvmsg+0x424/0x530 [ 891.866283] [c000000338757bd0] [c000000000b00654] sock_recvmsg+0x74/0xa0 [ 891.866403] [c000000338757c10] [c000000000b00f64] ___sys_recvmsg+0xf4/0x2f0 [ 891.866515] [c000000338757d90] [c000000000b02bb8] __sys_recvmsg+0x68/0xe0 [ 891.866631] [c000000338757e30] [c00000000000bbe4] system_call+0x5c/0x70 Fixes: 8c755ace357c ("crypto: vmx - Adding CBC routines for VMX module") Fixes: c07f5d3da643 ("crypto: vmx - Adding support for XTS") Cc: stable@vger.kernel.org Signed-off-by: Ondrej Mosnacek Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/vmx/aes_cbc.c | 30 ++++++++++++++---------------- 1 file changed, 14 
insertions(+), 16 deletions(-) diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c index 9506e8693c81..d8ef1147b344 100644 --- a/drivers/crypto/vmx/aes_cbc.c +++ b/drivers/crypto/vmx/aes_cbc.c @@ -111,24 +111,23 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc, ret = crypto_blkcipher_encrypt(&fallback_desc, dst, src, nbytes); } else { - preempt_disable(); - pagefault_disable(); - enable_kernel_altivec(); - enable_kernel_vsx(); - blkcipher_walk_init(&walk, dst, src, nbytes); ret = blkcipher_walk_virt(desc, &walk); while ((nbytes = walk.nbytes)) { + preempt_disable(); + pagefault_disable(); + enable_kernel_vsx(); + enable_kernel_altivec(); aes_p8_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr, nbytes & AES_BLOCK_MASK, &ctx->enc_key, walk.iv, 1); + pagefault_enable(); + preempt_enable(); + nbytes &= AES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, &walk, nbytes); } - - pagefault_enable(); - preempt_enable(); } return ret; @@ -152,24 +151,23 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc, ret = crypto_blkcipher_decrypt(&fallback_desc, dst, src, nbytes); } else { - preempt_disable(); - pagefault_disable(); - enable_kernel_altivec(); - enable_kernel_vsx(); - blkcipher_walk_init(&walk, dst, src, nbytes); ret = blkcipher_walk_virt(desc, &walk); while ((nbytes = walk.nbytes)) { + preempt_disable(); + pagefault_disable(); + enable_kernel_vsx(); + enable_kernel_altivec(); aes_p8_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr, nbytes & AES_BLOCK_MASK, &ctx->dec_key, walk.iv, 0); + pagefault_enable(); + preempt_enable(); + nbytes &= AES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, &walk, nbytes); } - - pagefault_enable(); - preempt_enable(); } return ret; From da7d5af64afd3f18b727c8d7787265c555e2fded Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 18 Jan 2018 14:05:05 +0000 Subject: [PATCH 1028/1212] mtd: ubi: wl: Fix error return code in ubi_wl_init() commit 7233982ade15eeac05c6f351e8d347406e6bcd2f upstream. 
Fix to return error code -ENOMEM from the kmem_cache_alloc() error handling case instead of 0, as done elsewhere in this function. Fixes: f78e5623f45b ("ubi: fastmap: Erase outdated anchor PEBs during attach") Signed-off-by: Wei Yongjun Reviewed-by: Boris Brezillon Signed-off-by: Richard Weinberger Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/ubi/wl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index b3c1b8106a68..f4b3ce2b2bc3 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1597,8 +1597,10 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); - if (!e) + if (!e) { + err = -ENOMEM; goto out_free; + } e->pnum = aeb->pnum; e->ec = aeb->ec; @@ -1617,8 +1619,10 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); - if (!e) + if (!e) { + err = -ENOMEM; goto out_free; + } e->pnum = aeb->pnum; e->ec = aeb->ec; From 4bdac2526df0ea4fad8a78958fe462f1ab4a17dd Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:51:45 -0700 Subject: [PATCH 1029/1212] autofs: fix autofs_sbi() does not check super block type commit 0633da48f0793aeba27f82d30605624416723a91 upstream. autofs_sbi() does not check the superblock magic number to verify it has been given an autofs super block. Backport Note: autofs4 has been renamed to autofs upstream. As a result the upstream patch does not apply cleanly onto 4.14.y. 
Link: http://lkml.kernel.org/r/153475422934.17131.7563724552005298277.stgit@pluto.themaw.net Reported-by: Signed-off-by: Ian Kent Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Zubin Mithra Signed-off-by: Greg Kroah-Hartman --- fs/autofs4/autofs_i.h | 4 +++- fs/autofs4/inode.c | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 502d3892d8a4..d71e7ad4d382 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -18,6 +18,7 @@ #include #include #include +#include /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -135,7 +136,8 @@ struct autofs_sb_info { static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) { - return (struct autofs_sb_info *)(sb->s_fs_info); + return sb->s_magic != AUTOFS_SUPER_MAGIC ? + NULL : (struct autofs_sb_info *)(sb->s_fs_info); } static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index a3ae0b2aeb5a..1132fe71b312 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "autofs_i.h" #include From e29c75d4fb491a7b1890293302f7ab40be7ce586 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 3 Sep 2018 10:19:13 -0700 Subject: [PATCH 1030/1212] x86/speculation/l1tf: Increase l1tf memory limit for Nehalem+ [upstream cc51e5428ea54f575d49cfcede1d4cb3a72b4ec4 for 4.4. Note there might be still a trivial conflict with the backport for b0a182f875689647b014bc01d36b340217792852, but should be easy to resolve] On Nehalem and newer core CPUs the CPU cache internally uses 44 bits physical address space. The L1TF workaround is limited by this internal cache address width, and needs to have one bit free there for the mitigation to work. 
Older client systems report only 36bit physical address space so the range check decides that L1TF is not mitigated for a 36bit phys/32GB system with some memory holes. But since these actually have the larger internal cache width this warning is bogus because it would only really be needed if the system had more than 43bits of memory. Add a new internal x86_cache_bits field. Normally it is the same as the physical bits field reported by CPUID, but for Nehalem and newer force it to be at least 44bits. Change the L1TF memory size warning to use the new cache_bits field to avoid bogus warnings and remove the bogus comment about memory size. Fixes: 17dbca119312 ("x86/speculation/l1tf: Add sysfs reporting for l1tf") Reported-by: George Anchev Reported-by: Christopher Snowhill Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Cc: Michael Hocko Cc: vbabka@suse.cz Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180824170351.34874-1-andi@firstfloor.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 4 ++- arch/x86/kernel/cpu/bugs.c | 47 ++++++++++++++++++++++++++++---- arch/x86/kernel/cpu/common.c | 2 ++ 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 337c52192278..440a948c4feb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -104,6 +104,8 @@ struct cpuinfo_x86 { __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; + + __u8 x86_cache_bits; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -174,7 +176,7 @@ extern void cpu_detect(struct cpuinfo_x86 *c); static inline unsigned long long l1tf_pfn_limit(void) { - return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT); + return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); } extern void
early_cpu_init(void); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b9e6b60df148..621bc6561189 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -634,6 +634,46 @@ void x86_spec_ctrl_setup_ap(void) #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt + +/* + * These CPUs all support 44bits physical address space internally in the + * cache but CPUID can report a smaller number of physical address bits. + * + * The L1TF mitigation uses the top most address bit for the inversion of + * non present PTEs. When the installed memory reaches into the top most + * address bit due to memory holes, which has been observed on machines + * which report 36bits physical address bits and have 32G RAM installed, + * then the mitigation range check in l1tf_select_mitigation() triggers. + * This is a false positive because the mitigation is still possible due to + * the fact that the cache uses 44bit internally. Use the cache bits + * instead of the reported physical bits and adjust them on the affected + * machines to 44bit if the reported bits are less than 44. 
+ */ +static void override_cache_bits(struct cpuinfo_x86 *c) +{ + if (c->x86 != 6) + return; + + switch (c->x86_model) { + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_SANDYBRIDGE: + case INTEL_FAM6_IVYBRIDGE: + case INTEL_FAM6_HASWELL_CORE: + case INTEL_FAM6_HASWELL_ULT: + case INTEL_FAM6_HASWELL_GT3E: + case INTEL_FAM6_BROADWELL_CORE: + case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + if (c->x86_cache_bits < 44) + c->x86_cache_bits = 44; + break; + } +} + static void __init l1tf_select_mitigation(void) { u64 half_pa; @@ -641,16 +681,13 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + override_cache_bits(&boot_cpu_data); + #if CONFIG_PGTABLE_LEVELS == 2 pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); return; #endif - /* - * This is extremely unlikely to happen because almost all - * systems have far more MAX_PA/2 than RAM can be fit into - * DIMM slots. - */ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4d3fa79c0f09..b12c0287d6cf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -798,6 +798,8 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_phys_bits = 36; #endif + c->x86_cache_bits = c->x86_phys_bits; + if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); From 88d6918401a4ecdc50fe77df3e1e77c1e49d8579 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 12 Sep 2018 23:57:48 -1000 Subject: [PATCH 1031/1212] mm: get rid of vmacache_flush_all() entirely commit 7a9cdebdcc17e426fb5287e4a82db1dfe86339b2 upstream. 
Jann Horn points out that the vmacache_flush_all() function is not only potentially expensive, it's buggy too. It also happens to be entirely unnecessary, because the sequence number overflow case can be avoided by simply making the sequence number be 64-bit. That doesn't even grow the data structures in question, because the other adjacent fields are already 64-bit. So simplify the whole thing by just making the sequence number overflow case go away entirely, which gets rid of all the complications and makes the code faster too. Win-win. [ Oleg Nesterov points out that the VMACACHE_FULL_FLUSHES statistics also just goes away entirely with this ] Reported-by: Jann Horn Suggested-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm_types.h | 2 +- include/linux/sched.h | 2 +- include/linux/vm_event_item.h | 1 - include/linux/vmacache.h | 5 ----- mm/debug.c | 4 ++-- mm/vmacache.c | 38 ----------------------------------- 6 files changed, 4 insertions(+), 48 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36f4695aa604..ad2a081bac66 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -392,7 +392,7 @@ struct kioctx_table; struct mm_struct { struct vm_area_struct *mmap; /* list of VMAs */ struct rb_root mm_rb; - u32 vmacache_seqnum; /* per-thread vmacache */ + u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/sched.h b/include/linux/sched.h index 725498cc5d30..b30540d6d125 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1454,7 +1454,7 @@ struct task_struct { struct mm_struct *mm, *active_mm; /* per-thread vma caching */ - u32 vmacache_seqnum; + u64 vmacache_seqnum; struct vm_area_struct *vmacache[VMACACHE_SIZE]; #if defined(SPLIT_RSS_COUNTING) struct 
task_rss_stat rss_stat; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8ef3a61fdc74..fdac5800872d 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -88,7 +88,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_DEBUG_VM_VMACACHE VMACACHE_FIND_CALLS, VMACACHE_FIND_HITS, - VMACACHE_FULL_FLUSHES, #endif NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h index c3fa0fd43949..4f58ff2dacd6 100644 --- a/include/linux/vmacache.h +++ b/include/linux/vmacache.h @@ -15,7 +15,6 @@ static inline void vmacache_flush(struct task_struct *tsk) memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); } -extern void vmacache_flush_all(struct mm_struct *mm); extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr); @@ -29,10 +28,6 @@ extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, static inline void vmacache_invalidate(struct mm_struct *mm) { mm->vmacache_seqnum++; - - /* deal with overflows */ - if (unlikely(mm->vmacache_seqnum == 0)) - vmacache_flush_all(mm); } #endif /* __LINUX_VMACACHE_H */ diff --git a/mm/debug.c b/mm/debug.c index 668aa35191ca..689b6e911cae 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -168,7 +168,7 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n" + pr_emerg("mm %p mmap %p seqnum %llu task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %p\n" #endif @@ -198,7 +198,7 @@ void dump_mm(const struct mm_struct *mm) #endif "%s", /* This is here to hold the comma */ - mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, + mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif diff --git a/mm/vmacache.c b/mm/vmacache.c index fd09dc9c6812..9c8ff3d4eda9 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -5,44 
+5,6 @@ #include #include -/* - * Flush vma caches for threads that share a given mm. - * - * The operation is safe because the caller holds the mmap_sem - * exclusively and other threads accessing the vma cache will - * have mmap_sem held at least for read, so no extra locking - * is required to maintain the vma cache. - */ -void vmacache_flush_all(struct mm_struct *mm) -{ - struct task_struct *g, *p; - - count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); - - /* - * Single threaded tasks need not iterate the entire - * list of process. We can avoid the flushing as well - * since the mm's seqnum was increased and don't have - * to worry about other threads' seqnum. Current's - * flush will occur upon the next lookup. - */ - if (atomic_read(&mm->mm_users) == 1) - return; - - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * Only flush the vmacache pointers as the - * mm seqnum is already set and curr's will - * be set upon invalidation when the next - * lookup is done. - */ - if (mm == p->mm) - vmacache_flush(p); - } - rcu_read_unlock(); -} - /* * This task may be accessing a foreign mm via (for example) * get_user_pages()->find_vma(). 
The vmacache is task-local and this From d9560919689d588beccf719452086b5cdf6d6c22 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 19 Sep 2018 22:49:00 +0200 Subject: [PATCH 1032/1212] Linux 4.4.157 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6dd5924a7ea5..2d55f88e6a08 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 156 +SUBLEVEL = 157 EXTRAVERSION = NAME = Blurry Fish Butt From a8323a85377619afd0ca6240c90e3668940c247c Mon Sep 17 00:00:00 2001 From: Miao Zhong Date: Mon, 23 Jul 2018 20:56:58 +0800 Subject: [PATCH 1033/1212] iommu/arm-smmu-v3: sync the OVACKFLG to PRIQ consumer register [ Upstream commit 0d535967ac658966c6ade8f82b5799092f7d5441 ] When PRI queue occurs overflow, driver should update the OVACKFLG to the PRIQ consumer register, otherwise subsequent PRI requests will not be processed. Cc: Will Deacon Cc: Robin Murphy Signed-off-by: Miao Zhong Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/arm-smmu-v3.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 347aaaa5a7ea..fc6eb752ab35 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1219,6 +1219,7 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev) /* Sync our overflow flag, as we believe we're up to speed */ q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons); + writel(q->cons, q->cons_reg); return IRQ_HANDLED; } From 5df13ba493746f4b4a4d8f8bbf7c6bc86c7a899b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 25 Jul 2018 23:00:48 +0200 Subject: [PATCH 1034/1212] ALSA: msnd: Fix the default sample sizes [ Upstream commit 7c500f9ea139d0c9b80fdea5a9c911db3166ea54 ] The default sample sizes set by msnd driver are bogus; it sets ALSA PCM format, not the actual bit width. 
Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/isa/msnd/msnd_pinnacle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/isa/msnd/msnd_pinnacle.c b/sound/isa/msnd/msnd_pinnacle.c index a31ea6c22d19..2d7379dec1f0 100644 --- a/sound/isa/msnd/msnd_pinnacle.c +++ b/sound/isa/msnd/msnd_pinnacle.c @@ -82,10 +82,10 @@ static void set_default_audio_parameters(struct snd_msnd *chip) { - chip->play_sample_size = DEFSAMPLESIZE; + chip->play_sample_size = snd_pcm_format_width(DEFSAMPLESIZE); chip->play_sample_rate = DEFSAMPLERATE; chip->play_channels = DEFCHANNELS; - chip->capture_sample_size = DEFSAMPLESIZE; + chip->capture_sample_size = snd_pcm_format_width(DEFSAMPLESIZE); chip->capture_sample_rate = DEFSAMPLERATE; chip->capture_channels = DEFCHANNELS; } From 650ca59a640ba016d3a29555cdbc5c2623c753e9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 25 Jul 2018 23:00:46 +0200 Subject: [PATCH 1035/1212] ALSA: usb-audio: Fix multiple definitions in AU0828_DEVICE() macro [ Upstream commit bd1cd0eb2ce9141100628d476ead4de485501b29 ] AU0828_DEVICE() macro in quirks-table.h uses USB_DEVICE_VENDOR_SPEC() for expanding idVendor and idProduct fields. However, the latter macro adds also match_flags and bInterfaceClass, which are different from the values AU0828_DEVICE() macro sets after that. For fixing them, just expand idVendor and idProduct fields manually in AU0828_DEVICE(). 
This fixes sparse warnings like: sound/usb/quirks-table.h:2892:1: warning: Initializer entry defined twice Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks-table.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 69bf5cf1e91e..15cbe2565703 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -2875,7 +2875,8 @@ YAMAHA_DEVICE(0x7010, "UB99"), */ #define AU0828_DEVICE(vid, pid, vname, pname) { \ - USB_DEVICE_VENDOR_SPEC(vid, pid), \ + .idVendor = vid, \ + .idProduct = pid, \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_INT_CLASS | \ USB_DEVICE_ID_MATCH_INT_SUBCLASS, \ From 1e89472ff0635fc68c19e44ec552ed8a9d6e71e4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 25 Jul 2018 16:54:33 +0800 Subject: [PATCH 1036/1212] xfrm: fix 'passing zero to ERR_PTR()' warning [ Upstream commit 934ffce1343f22ed5e2d0bd6da4440f4848074de ] Fix a static code checker warning: net/xfrm/xfrm_policy.c:1836 xfrm_resolve_and_create_bundle() warn: passing zero to 'ERR_PTR' xfrm_tmpl_resolve return 0 just means no xdst found, return NULL instead of passing zero to ERR_PTR. 
Fixes: d809ec895505 ("xfrm: do not assume that template resolving always returns xfrms") Signed-off-by: YueHaibing Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/xfrm/xfrm_policy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e9eecf6f0bff..48080f89ed25 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1845,7 +1845,10 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { - if (err != 0 && err != -EAGAIN) + if (err == 0) + return NULL; + + if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } From 9e8d585c52c5db4780c53f949cd3fdb9d6a50b54 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 25 Jul 2018 18:45:08 +0100 Subject: [PATCH 1037/1212] gfs2: Special-case rindex for gfs2_grow [ Upstream commit 776125785a87ff05d49938bd5b9f336f2a05bff6 ] To speed up the common case of appending to a file, gfs2_write_alloc_required presumes that writing beyond the end of a file will always require additional blocks to be allocated. This assumption is incorrect for preallocated files, but there are no negative consequences as long as *some* space is still left on the filesystem. One special file that always has some space preallocated beyond the end of the file is the rindex: when growing a filesystem, gfs2_grow adds one or more new resource groups and appends records describing those resource groups to the rindex; the preallocated space ensures that this is always possible. However, when a filesystem is completely full, gfs2_write_alloc_required will indicate that an additional allocation is required, and appending the next record to the rindex will fail even though space for that record has already been preallocated. 
To fix that, skip the incorrect optimization in gfs2_write_alloc_required, but for the rindex only. Other writes to preallocated space beyond the end of the file are still allowed to fail on completely full filesystems. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 61296ecbd0e2..09476bb8f6cd 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1476,7 +1476,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; - if (lblock_stop > end_of_file) + if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex)) return 1; size = (lblock_stop - lblock) << shift; From 210006efbb7a8dfd1c0356df939073f65e7d1ad8 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 13 Jul 2018 13:13:20 +0200 Subject: [PATCH 1038/1212] clk: imx6ul: fix missing of_node_put() [ Upstream commit 11177e7a7aaef95935592072985526ebf0a3df43 ] of_find_compatible_node() is returning a device node with refcount incremented and must be explicitly decremented after the last use which is right after the use in of_iomap() here. 
Signed-off-by: Nicholas Mc Guire Fixes: 787b4271a6a0 ("clk: imx: add imx6ul clk tree support") Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/clk/imx/clk-imx6ul.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clk/imx/clk-imx6ul.c b/drivers/clk/imx/clk-imx6ul.c index 01718d05e952..9e8f0e255de2 100644 --- a/drivers/clk/imx/clk-imx6ul.c +++ b/drivers/clk/imx/clk-imx6ul.c @@ -120,6 +120,7 @@ static void __init imx6ul_clocks_init(struct device_node *ccm_node) np = of_find_compatible_node(NULL, NULL, "fsl,imx6ul-anatop"); base = of_iomap(np, 0); + of_node_put(np); WARN_ON(!base); clks[IMX6UL_PLL1_BYPASS_SRC] = imx_clk_mux("pll1_bypass_src", base + 0x00, 14, 1, pll_bypass_src_sels, ARRAY_SIZE(pll_bypass_src_sels)); From d6ac46c7db364c3dd4b328a9e6616c9f26985f2b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 20 Jul 2018 16:46:33 +0900 Subject: [PATCH 1039/1212] kbuild: add .DELETE_ON_ERROR special target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9c2af1c7377a8a6ef86e5cabf80978f3dbbb25c0 ] If Make gets a fatal signal while a shell is executing, it may delete the target file that the recipe was supposed to update. This is needed to make sure that it is remade from scratch when Make is next run; if Make is interrupted after the recipe has begun to write the target file, it results in an incomplete file whose time stamp is newer than that of the prerequisites files. Make automatically deletes the incomplete file on interrupt unless the target is marked .PRECIOUS. The situation is just the same as when the shell fails for some reasons. Usually when a recipe line fails, if it has changed the target file at all, the file is corrupted, or at least it is not completely updated. Yet the file’s time stamp says that it is now up to date, so the next time Make runs, it will not try to update that file. 
However, Make does not cater to delete the incomplete target file in this case. We need to add .DELETE_ON_ERROR somewhere in the Makefile to request it. scripts/Kbuild.include seems a suitable place to add it because it is included from almost all sub-makes. Please note .DELETE_ON_ERROR is not effective for phony targets. The external module building should never ever touch the kernel tree. The following recipe fails if include/generated/autoconf.h is missing. However, include/config/auto.conf is not deleted since it is a phony target. PHONY += include/config/auto.conf include/config/auto.conf: $(Q)test -e include/generated/autoconf.h -a -e $@ || ( \ echo >&2; \ echo >&2 " ERROR: Kernel configuration is invalid."; \ echo >&2 " include/generated/autoconf.h or $@ are missing.";\ echo >&2 " Run 'make oldconfig && make prepare' on kernel src to fix it."; \ echo >&2 ; \ /bin/false) Signed-off-by: Masahiro Yamada Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- scripts/Kbuild.include | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 31a981d6229d..5897fc3857a0 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -359,3 +359,6 @@ endif endef # ############################################################################### + +# delete partially updated (i.e. corrupted) files on error +.DELETE_ON_ERROR: From 46c66ac938163769834ce38a298c65795ff9e0bc Mon Sep 17 00:00:00 2001 From: John Keeping Date: Tue, 17 Jul 2018 11:48:16 +0100 Subject: [PATCH 1040/1212] dmaengine: pl330: fix irq race with terminate_all [ Upstream commit e49756544a21f5625b379b3871d27d8500764670 ] In pl330_update() when checking if a channel has been aborted, the channel's lock is not taken, only the overall pl330_dmac lock. But in pl330_terminate_all() the aborted flag (req_running==-1) is set under the channel lock and not the pl330_dmac lock. 
With threaded interrupts, this leads to a potential race: pl330_terminate_all pl330_update ------------------- ------------ lock channel entry lock pl330 _stop channel unlock pl330 lock pl330 check req_running != -1 req_running = -1 _start channel Signed-off-by: John Keeping Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/dma/pl330.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 8db791ef2027..95619ee33112 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -2132,13 +2132,14 @@ static int pl330_terminate_all(struct dma_chan *chan) pm_runtime_get_sync(pl330->ddma.dev); spin_lock_irqsave(&pch->lock, flags); + spin_lock(&pl330->lock); _stop(pch->thread); - spin_unlock(&pl330->lock); - pch->thread->req[0].desc = NULL; pch->thread->req[1].desc = NULL; pch->thread->req_running = -1; + spin_unlock(&pl330->lock); + power_down = pch->active; pch->active = false; From 9f6d6fb580c0f26cd7ae75dfe75063f4ccc6ccca Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 20 Jul 2018 13:58:22 +0200 Subject: [PATCH 1041/1212] MIPS: ath79: fix system restart [ Upstream commit f8a7bfe1cb2c1ebfa07775c9c8ac0ad3ba8e5ff5 ] This patch disables irq on reboot to fix hang issues that were observed due to pending interrupts. 
Signed-off-by: Felix Fietkau Signed-off-by: John Crispin Signed-off-by: Paul Burton Patchwork: https://patchwork.linux-mips.org/patch/19913/ Cc: James Hogan Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/ath79/setup.c | 1 + arch/mips/include/asm/mach-ath79/ath79.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c index 8755d618e116..961c393c0f55 100644 --- a/arch/mips/ath79/setup.c +++ b/arch/mips/ath79/setup.c @@ -44,6 +44,7 @@ static char ath79_sys_type[ATH79_SYS_TYPE_LEN]; static void ath79_restart(char *command) { + local_irq_disable(); ath79_device_reset_set(AR71XX_RESET_FULL_CHIP); for (;;) if (cpu_wait) diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h index 4eee221b0cf0..d2be8e4f7a35 100644 --- a/arch/mips/include/asm/mach-ath79/ath79.h +++ b/arch/mips/include/asm/mach-ath79/ath79.h @@ -133,6 +133,7 @@ static inline u32 ath79_pll_rr(unsigned reg) static inline void ath79_reset_wr(unsigned reg, u32 val) { __raw_writel(val, ath79_reset_base + reg); + (void) __raw_readl(ath79_reset_base + reg); /* flush */ } static inline u32 ath79_reset_rr(unsigned reg) From 0eda7472163dbda62a0ac27d3e89851f0e603eba Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 5 Jul 2018 04:25:19 -0400 Subject: [PATCH 1042/1212] media: videobuf2-core: check for q->error in vb2_core_qbuf() [ Upstream commit b509d733d337417bcb7fa4a35be3b9a49332b724 ] The vb2_core_qbuf() function didn't check if q->error was set. It is checked in __buf_prepare(), but that function isn't called if the buffer was already prepared before with VIDIOC_PREPARE_BUF. So check it at the start of vb2_core_qbuf() as well. 
Signed-off-by: Hans Verkuil Acked-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/videobuf2-core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c index 0c1a42bf27fd..1c37d5a78822 100644 --- a/drivers/media/v4l2-core/videobuf2-core.c +++ b/drivers/media/v4l2-core/videobuf2-core.c @@ -1366,6 +1366,11 @@ int vb2_core_qbuf(struct vb2_queue *q, unsigned int index, void *pb) struct vb2_buffer *vb; int ret; + if (q->error) { + dprintk(1, "fatal error occurred on queue\n"); + return -EIO; + } + vb = q->bufs[index]; switch (vb->state) { From 66e32b788f4f3336cdff64ebed24c290a8ae8589 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Jul 2018 11:29:01 -0700 Subject: [PATCH 1043/1212] mtd/maps: fix solutionengine.c printk format warnings [ Upstream commit 1d25e3eeed1d987404e2d2e451eebac8c15cecc1 ] Fix 2 printk format warnings (this driver is currently only used by arch/sh/) by using "%pap" instead of "%lx". 
Fixes these build warnings: ../drivers/mtd/maps/solutionengine.c: In function 'init_soleng_maps': ../include/linux/kern_levels.h:5:18: warning: format '%lx' expects argument of type 'long unsigned int', but argument 2 has type 'resource_size_t' {aka 'unsigned int'} [-Wformat=] ../drivers/mtd/maps/solutionengine.c:62:54: note: format string is defined here printk(KERN_NOTICE "Solution Engine: Flash at 0x%08lx, EPROM at 0x%08lx\n", ~~~~^ %08x ../include/linux/kern_levels.h:5:18: warning: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'resource_size_t' {aka 'unsigned int'} [-Wformat=] ../drivers/mtd/maps/solutionengine.c:62:72: note: format string is defined here printk(KERN_NOTICE "Solution Engine: Flash at 0x%08lx, EPROM at 0x%08lx\n", ~~~~^ %08x Cc: David Woodhouse Cc: Brian Norris Cc: Boris Brezillon Cc: Marek Vasut Cc: Richard Weinberger Cc: linux-mtd@lists.infradead.org Cc: Yoshinori Sato Cc: Rich Felker Cc: linux-sh@vger.kernel.org Cc: Sergei Shtylyov Signed-off-by: Randy Dunlap Signed-off-by: Boris Brezillon Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/maps/solutionengine.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/maps/solutionengine.c b/drivers/mtd/maps/solutionengine.c index bb580bc16445..c07f21b20463 100644 --- a/drivers/mtd/maps/solutionengine.c +++ b/drivers/mtd/maps/solutionengine.c @@ -59,9 +59,9 @@ static int __init init_soleng_maps(void) return -ENXIO; } } - printk(KERN_NOTICE "Solution Engine: Flash at 0x%08lx, EPROM at 0x%08lx\n", - soleng_flash_map.phys & 0x1fffffff, - soleng_eprom_map.phys & 0x1fffffff); + printk(KERN_NOTICE "Solution Engine: Flash at 0x%pap, EPROM at 0x%pap\n", + &soleng_flash_map.phys, + &soleng_eprom_map.phys); flash_mtd->owner = THIS_MODULE; eprom_mtd = do_map_probe("map_rom", &soleng_eprom_map); From 98c059565202926a5ede0c3c04df4c152abd17a1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Jul 2018 19:11:28 
+0200 Subject: [PATCH 1044/1212] fbdev: omapfb: off by one in omapfb_register_client() [ Upstream commit 5ec1ec35b2979b59d0b33381e7c9aac17e159d16 ] The omapfb_register_client[] array has OMAPFB_PLANE_NUM elements so the > should be >= or we are one element beyond the end of the array. Fixes: 8b08cf2b64f5 ("OMAP: add TI OMAP framebuffer driver") Signed-off-by: Dan Carpenter Cc: Imre Deak Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/omap/omapfb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/omap/omapfb_main.c b/drivers/video/fbdev/omap/omapfb_main.c index 393ae1bc07e8..a8a6f072fb78 100644 --- a/drivers/video/fbdev/omap/omapfb_main.c +++ b/drivers/video/fbdev/omap/omapfb_main.c @@ -977,7 +977,7 @@ int omapfb_register_client(struct omapfb_notifier_block *omapfb_nb, { int r; - if ((unsigned)omapfb_nb->plane_idx > OMAPFB_PLANE_NUM) + if ((unsigned)omapfb_nb->plane_idx >= OMAPFB_PLANE_NUM) return -EINVAL; if (!notifier_inited) { From 1401b76dfbccc4c8bd6718e2437e112593995b7d Mon Sep 17 00:00:00 2001 From: Anton Vasilyev Date: Tue, 24 Jul 2018 19:11:27 +0200 Subject: [PATCH 1045/1212] video: goldfishfb: fix memory leak on driver remove [ Upstream commit 5958fde72d04e7b8c6de3669d1f794a90997e3eb ] goldfish_fb_probe() allocates memory for fb, but goldfish_fb_remove() does not have deallocation of fb, which leads to memory leak on probe/remove. The patch adds deallocation into goldfish_fb_remove(). Found by Linux Driver Verification project (linuxtesting.org). 
Signed-off-by: Anton Vasilyev Cc: Aleksandar Markovic Cc: Miodrag Dinic Cc: Goran Ferenc Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/goldfishfb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 7f6c9e6cfc6c..14a93cb21310 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -301,6 +301,7 @@ static int goldfish_fb_remove(struct platform_device *pdev) dma_free_coherent(&pdev->dev, framesize, (void *)fb->fb.screen_base, fb->fb.fix.smem_start); iounmap(fb->reg_base); + kfree(fb); return 0; } From 5a85c8d629cbd5731c9eda6cc1a7c041d027683f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 24 Jul 2018 19:11:27 +0200 Subject: [PATCH 1046/1212] fbdev/via: fix defined but not used warning [ Upstream commit b6566b47a67e07fdca44cf51abb14e2fbe17d3eb ] Fix a build warning in viafbdev.c when CONFIG_PROC_FS is not enabled by marking the unused function as __maybe_unused. ../drivers/video/fbdev/via/viafbdev.c:1471:12: warning: 'viafb_sup_odev_proc_show' defined but not used [-Wunused-function] Signed-off-by: Randy Dunlap Cc: Florian Tobias Schandinat Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/via/viafbdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/via/viafbdev.c b/drivers/video/fbdev/via/viafbdev.c index badee04ef496..71b5dca95bdb 100644 --- a/drivers/video/fbdev/via/viafbdev.c +++ b/drivers/video/fbdev/via/viafbdev.c @@ -19,6 +19,7 @@ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
 */ +#include #include #include #include @@ -1468,7 +1469,7 @@ static const struct file_operations viafb_vt1636_proc_fops = { #endif /* CONFIG_FB_VIA_DIRECT_PROCFS */ -static int viafb_sup_odev_proc_show(struct seq_file *m, void *v) +static int __maybe_unused viafb_sup_odev_proc_show(struct seq_file *m, void *v) { via_odev_to_seq(m, supported_odev_map[ viaparinfo->shared->chip_info.gfx_chip_name]); From e71975f0d7d5821d384af9fac2c06a67619a962f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Tue, 10 Jul 2018 19:28:14 +0530 Subject: [PATCH 1047/1212] perf powerpc: Fix callchain ip filtering when return address is in a register [ Upstream commit 9068533e4f470daf2b0f29c71d865990acd8826e ] For powerpc64, perf will filter out the second entry in the callchain, i.e. the LR value, if the return address of the function corresponding to the probed location has already been saved on its caller's stack. The state of the return address is determined using debug information. At any point within a function, if the return address is already saved somewhere, a DWARF expression can tell us about its location. If the return address is still in LR only, no DWARF expression would exist. Typically, the instructions in a function's prologue first copy the LR value to R0 and then push R0 on to the stack. If LR has already been copied to R0 but R0 is yet to be pushed to the stack, we can still get a DWARF expression that says that the return address is in R0. This is indicating that getting a DWARF expression for the return address does not guarantee the fact that it has already been saved on the stack. This can be observed on a powerpc64le system running Fedora 27 as shown below. # objdump -d /usr/lib64/libc-2.26.so | less ... 
000000000015af20 : 15af20: 0b 00 4c 3c addis r2,r12,11 15af24: e0 c1 42 38 addi r2,r2,-15904 15af28: a6 02 08 7c mflr r0 15af2c: f0 ff c1 fb std r30,-16(r1) 15af30: f8 ff e1 fb std r31,-8(r1) 15af34: 78 1b 7f 7c mr r31,r3 15af38: 78 23 83 7c mr r3,r4 15af3c: 78 2b be 7c mr r30,r5 15af40: 10 00 01 f8 std r0,16(r1) 15af44: c1 ff 21 f8 stdu r1,-64(r1) 15af48: 28 00 81 f8 std r4,40(r1) ... # readelf --debug-dump=frames-interp /usr/lib64/libc-2.26.so | less ... 00027024 0000000000000024 00027028 FDE cie=00000000 pc=000000000015af20..000000000015af88 LOC CFA r30 r31 ra 000000000015af20 r1+0 u u u 000000000015af34 r1+0 c-16 c-8 r0 000000000015af48 r1+64 c-16 c-8 c+16 000000000015af5c r1+0 c-16 c-8 c+16 000000000015af78 r1+0 u u ... # perf probe -x /usr/lib64/libc-2.26.so -a inet_pton+0x18 # perf record -e probe_libc:inet_pton -g ping -6 -c 1 ::1 # perf script Before: ping 2829 [005] 512917.460174: probe_libc:inet_pton: (7fff7e2baf38) 7fff7e2baf38 __GI___inet_pton+0x18 (/usr/lib64/libc-2.26.so) 7fff7e2705b4 getaddrinfo+0x164 (/usr/lib64/libc-2.26.so) 12f152d70 _init+0xbfc (/usr/bin/ping) 7fff7e1836a0 generic_start_main.isra.0+0x140 (/usr/lib64/libc-2.26.so) 7fff7e183898 __libc_start_main+0xb8 (/usr/lib64/libc-2.26.so) 0 [unknown] ([unknown]) After: ping 2829 [005] 512917.460174: probe_libc:inet_pton: (7fff7e2baf38) 7fff7e2baf38 __GI___inet_pton+0x18 (/usr/lib64/libc-2.26.so) 7fff7e26fa54 gaih_inet.constprop.7+0xf44 (/usr/lib64/libc-2.26.so) 7fff7e2705b4 getaddrinfo+0x164 (/usr/lib64/libc-2.26.so) 12f152d70 _init+0xbfc (/usr/bin/ping) 7fff7e1836a0 generic_start_main.isra.0+0x140 (/usr/lib64/libc-2.26.so) 7fff7e183898 __libc_start_main+0xb8 (/usr/lib64/libc-2.26.so) 0 [unknown] ([unknown]) Reported-by: Ravi Bangoria Signed-off-by: Sandipan Das Cc: Jiri Olsa Cc: Maynard Johnson Cc: Naveen N. 
Rao Cc: Ravi Bangoria Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/66e848a7bdf2d43b39210a705ff6d828a0865661.1530724939.git.sandipan@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/perf/arch/powerpc/util/skip-callchain-idx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index bd630c222e65..9d1f6e976a5a 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -58,9 +58,13 @@ static int check_return_reg(int ra_regno, Dwarf_Frame *frame) } /* - * Check if return address is on the stack. + * Check if return address is on the stack. If return address + * is in a register (typically R0), it is yet to be saved on + * the stack. */ - if (nops != 0 || ops != NULL) + if ((nops != 0 || ops != NULL) && + !(nops == 1 && ops[0].atom == DW_OP_regx && + ops[0].number2 == 0 && ops[0].offset == 0)) return 0; /* From 57a1dd74441dc0c0ff5f0968138f4d82e556c80f Mon Sep 17 00:00:00 2001 From: Fredrik Noring Date: Tue, 24 Jul 2018 19:11:24 +0200 Subject: [PATCH 1048/1212] fbdev: Distinguish between interlaced and progressive modes [ Upstream commit 1ba0a59cea41ea05fda92daaf2a2958a2246b9cf ] I discovered the problem when developing a frame buffer driver for the PlayStation 2 (not yet merged), using the following video modes for the PlayStation 3 in drivers/video/fbdev/ps3fb.c: }, { /* 1080if */ "1080if", 50, 1920, 1080, 13468, 148, 484, 36, 4, 88, 5, FB_SYNC_BROADCAST, FB_VMODE_INTERLACED }, { /* 1080pf */ "1080pf", 50, 1920, 1080, 6734, 148, 484, 36, 4, 88, 5, FB_SYNC_BROADCAST, FB_VMODE_NONINTERLACED }, In ps3fb_probe, the mode_option module parameter is used with fb_find_mode but it can only select the interlaced variant of 1920x1080 since the loop matching the modes does not take the difference between 
interlaced and progressive modes into account. In short, without the patch, progressive 1920x1080 cannot be chosen as a mode_option parameter since fb_find_mode (falsely) thinks interlace is a perfect match. Signed-off-by: Fredrik Noring Cc: "Maciej W. Rozycki" [b.zolnierkie: updated patch description] Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/core/modedb.c | 41 ++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/video/fbdev/core/modedb.c b/drivers/video/fbdev/core/modedb.c index 2510fa728d77..de119f11b78f 100644 --- a/drivers/video/fbdev/core/modedb.c +++ b/drivers/video/fbdev/core/modedb.c @@ -644,7 +644,7 @@ static int fb_try_mode(struct fb_var_screeninfo *var, struct fb_info *info, * * Valid mode specifiers for @mode_option: * - * x[M][R][-][@][i][m] or + * x[M][R][-][@][i][p][m] or * [-][@] * * with , , and decimal numbers and @@ -653,10 +653,10 @@ static int fb_try_mode(struct fb_var_screeninfo *var, struct fb_info *info, * If 'M' is present after yres (and before refresh/bpp if present), * the function will compute the timings using VESA(tm) Coordinated * Video Timings (CVT). If 'R' is present after 'M', will compute with - * reduced blanking (for flatpanels). If 'i' is present, compute - * interlaced mode. If 'm' is present, add margins equal to 1.8% - * of xres rounded down to 8 pixels, and 1.8% of yres. The char - * 'i' and 'm' must be after 'M' and 'R'. Example: + * reduced blanking (for flatpanels). If 'i' or 'p' are present, compute + * interlaced or progressive mode. If 'm' is present, add margins equal + * to 1.8% of xres rounded down to 8 pixels, and 1.8% of yres. The chars + * 'i', 'p' and 'm' must be after 'M' and 'R'. Example: * * 1024x768MR-8@60m - Reduced blank with margins at 60Hz. 
* @@ -697,7 +697,8 @@ int fb_find_mode(struct fb_var_screeninfo *var, unsigned int namelen = strlen(name); int res_specified = 0, bpp_specified = 0, refresh_specified = 0; unsigned int xres = 0, yres = 0, bpp = default_bpp, refresh = 0; - int yres_specified = 0, cvt = 0, rb = 0, interlace = 0; + int yres_specified = 0, cvt = 0, rb = 0; + int interlace_specified = 0, interlace = 0; int margins = 0; u32 best, diff, tdiff; @@ -748,9 +749,17 @@ int fb_find_mode(struct fb_var_screeninfo *var, if (!cvt) margins = 1; break; + case 'p': + if (!cvt) { + interlace = 0; + interlace_specified = 1; + } + break; case 'i': - if (!cvt) + if (!cvt) { interlace = 1; + interlace_specified = 1; + } break; default: goto done; @@ -819,11 +828,21 @@ int fb_find_mode(struct fb_var_screeninfo *var, if ((name_matches(db[i], name, namelen) || (res_specified && res_matches(db[i], xres, yres))) && !fb_try_mode(var, info, &db[i], bpp)) { - if (refresh_specified && db[i].refresh == refresh) - return 1; + const int db_interlace = (db[i].vmode & + FB_VMODE_INTERLACED ? 1 : 0); + int score = abs(db[i].refresh - refresh); - if (abs(db[i].refresh - refresh) < diff) { - diff = abs(db[i].refresh - refresh); + if (interlace_specified) + score += abs(db_interlace - interlace); + + if (!interlace_specified || + db_interlace == interlace) + if (refresh_specified && + db[i].refresh == refresh) + return 1; + + if (score < diff) { + diff = score; best = i; } } From 016353ef55c98bf9d743007e4ffedf5949ef175b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 24 Jul 2018 18:48:14 +0200 Subject: [PATCH 1049/1212] ARM: exynos: Clear global variable on init error path [ Upstream commit cd4806911cee3901bc2b5eb95603cf1958720b57 ] For most of Exynos SoCs, Power Management Unit (PMU) address space is mapped into global variable 'pmu_base_addr' very early when initializing PMU interrupt controller. 
A lot of other machine code depends on it so when doing iounmap() on this address, clear the global as well to avoid usage of invalid value (pointing to unmapped memory region). Properly mapped PMU address space is a requirement for all other machine code so this fix is purely theoretical. Boot will fail immediately in many other places after following this error path. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-exynos/suspend.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c index c169cc3049aa..e8adb428dddb 100644 --- a/arch/arm/mach-exynos/suspend.c +++ b/arch/arm/mach-exynos/suspend.c @@ -260,6 +260,7 @@ static int __init exynos_pmu_irq_init(struct device_node *node, NULL); if (!domain) { iounmap(pmu_base_addr); + pmu_base_addr = NULL; return -ENOMEM; } From d4aa4e4f77646293ad74d35f28fe4c48bea3ec9e Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Tue, 10 Jul 2018 19:28:13 +0530 Subject: [PATCH 1050/1212] perf powerpc: Fix callchain ip filtering [ Upstream commit c715fcfda5a08edabaa15508742be926b7ee51db ] For powerpc64, redundant entries in the callchain are filtered out by determining the state of the return address and the stack frame using DWARF debug information. For making these filtering decisions we must analyze the debug information for the location corresponding to the program counter value, i.e. the first entry in the callchain, and not the LR value; otherwise, perf may filter out either the second or the third entry in the callchain incorrectly. This can be observed on a powerpc64le system running Fedora 27 as shown below. Case 1 - Attaching a probe at inet_pton+0x8 (binary offset 0x15af28). Return address is still in LR and a new stack frame is not yet allocated. The LR value, i.e. the second entry, should not be filtered out. # objdump -d /usr/lib64/libc-2.26.so | less ... 000000000010eb10 : ... 
10fa48: 78 bb e4 7e mr r4,r23 10fa4c: 0a 00 60 38 li r3,10 10fa50: d9 b4 04 48 bl 15af28 10fa54: 00 00 00 60 nop 10fa58: ac f4 ff 4b b 10ef04 ... 0000000000110450 : ... 1105a8: 54 00 ff 38 addi r7,r31,84 1105ac: 58 00 df 38 addi r6,r31,88 1105b0: 69 e5 ff 4b bl 10eb18 1105b4: 78 1b 71 7c mr r17,r3 1105b8: 50 01 7f e8 ld r3,336(r31) ... 000000000015af20 : 15af20: 0b 00 4c 3c addis r2,r12,11 15af24: e0 c1 42 38 addi r2,r2,-15904 15af28: a6 02 08 7c mflr r0 15af2c: f0 ff c1 fb std r30,-16(r1) 15af30: f8 ff e1 fb std r31,-8(r1) ... # perf probe -x /usr/lib64/libc-2.26.so -a inet_pton+0x8 # perf record -e probe_libc:inet_pton -g ping -6 -c 1 ::1 # perf script Before: ping 4507 [002] 514985.546540: probe_libc:inet_pton: (7fffa7dbaf28) 7fffa7dbaf28 __GI___inet_pton+0x8 (/usr/lib64/libc-2.26.so) 7fffa7d705b4 getaddrinfo+0x164 (/usr/lib64/libc-2.26.so) 13fb52d70 _init+0xbfc (/usr/bin/ping) 7fffa7c836a0 generic_start_main.isra.0+0x140 (/usr/lib64/libc-2.26.so) 7fffa7c83898 __libc_start_main+0xb8 (/usr/lib64/libc-2.26.so) 0 [unknown] ([unknown]) After: ping 4507 [002] 514985.546540: probe_libc:inet_pton: (7fffa7dbaf28) 7fffa7dbaf28 __GI___inet_pton+0x8 (/usr/lib64/libc-2.26.so) 7fffa7d6fa54 gaih_inet.constprop.7+0xf44 (/usr/lib64/libc-2.26.so) 7fffa7d705b4 getaddrinfo+0x164 (/usr/lib64/libc-2.26.so) 13fb52d70 _init+0xbfc (/usr/bin/ping) 7fffa7c836a0 generic_start_main.isra.0+0x140 (/usr/lib64/libc-2.26.so) 7fffa7c83898 __libc_start_main+0xb8 (/usr/lib64/libc-2.26.so) 0 [unknown] ([unknown]) Case 2 - Attaching a probe at _int_malloc+0x180 (binary offset 0x9cf10). Return address is still in LR and a new stack frame has already been allocated but not used. The caller's caller, i.e. the third entry, is invalid and should be filtered out and not the second one. # objdump -d /usr/lib64/libc-2.26.so | less ... 
000000000009cd90 <_int_malloc>: 9cd90: 17 00 4c 3c addis r2,r12,23 9cd94: 70 a3 42 38 addi r2,r2,-23696 9cd98: 26 00 80 7d mfcr r12 9cd9c: f8 ff e1 fb std r31,-8(r1) 9cda0: 17 00 e4 3b addi r31,r4,23 9cda4: d8 ff 61 fb std r27,-40(r1) 9cda8: 78 23 9b 7c mr r27,r4 9cdac: 1f 00 bf 2b cmpldi cr7,r31,31 9cdb0: f0 ff c1 fb std r30,-16(r1) 9cdb4: b0 ff c1 fa std r22,-80(r1) 9cdb8: 78 1b 7e 7c mr r30,r3 9cdbc: 08 00 81 91 stw r12,8(r1) 9cdc0: 11 ff 21 f8 stdu r1,-240(r1) 9cdc4: 4c 01 9d 41 bgt cr7,9cf10 <_int_malloc+0x180> 9cdc8: 20 00 a4 2b cmpldi cr7,r4,32 ... 9cf08: 00 00 00 60 nop 9cf0c: 00 00 42 60 ori r2,r2,0 9cf10: e4 06 ff 7b rldicr r31,r31,0,59 9cf14: 40 f8 a4 7f cmpld cr7,r4,r31 9cf18: 68 05 9d 41 bgt cr7,9d480 <_int_malloc+0x6f0> ... 000000000009e3c0 : ... 9e420: 40 02 80 38 li r4,576 9e424: 78 fb e3 7f mr r3,r31 9e428: 71 e9 ff 4b bl 9cd98 <_int_malloc+0x8> 9e42c: 00 00 a3 2f cmpdi cr7,r3,0 9e430: 78 1b 7e 7c mr r30,r3 ... 000000000009f7a0 <__libc_malloc>: ... 9f8f8: 00 00 89 2f cmpwi cr7,r9,0 9f8fc: 1c ff 9e 40 bne cr7,9f818 <__libc_malloc+0x78> 9f900: c9 ea ff 4b bl 9e3c8 9f904: 00 00 00 60 nop 9f908: e8 90 22 e9 ld r9,-28440(r2) ... 
# perf probe -x /usr/lib64/libc-2.26.so -a _int_malloc+0x180 # perf record -e probe_libc:_int_malloc -g ./test-malloc # perf script Before: test-malloc 6554 [009] 515975.797403: probe_libc:_int_malloc: (7fffa6e6cf10) 7fffa6e6cf10 _int_malloc+0x180 (/usr/lib64/libc-2.26.so) 7fffa6dd0000 [unknown] (/usr/lib64/libc-2.26.so) 7fffa6e6f904 malloc+0x164 (/usr/lib64/libc-2.26.so) 7fffa6e6f9fc malloc+0x25c (/usr/lib64/libc-2.26.so) 100006b4 main+0x38 (/home/testuser/test-malloc) 7fffa6df36a0 generic_start_main.isra.0+0x140 (/usr/lib64/libc-2.26.so) 7fffa6df3898 __libc_start_main+0xb8 (/usr/lib64/libc-2.26.so) 0 [unknown] ([unknown]) After: test-malloc 6554 [009] 515975.797403: probe_libc:_int_malloc: (7fffa6e6cf10) 7fffa6e6cf10 _int_malloc+0x180 (/usr/lib64/libc-2.26.so) 7fffa6e6e42c tcache_init.part.4+0x6c (/usr/lib64/libc-2.26.so) 7fffa6e6f904 malloc+0x164 (/usr/lib64/libc-2.26.so) 7fffa6e6f9fc malloc+0x25c (/usr/lib64/libc-2.26.so) 100006b4 main+0x38 (/home/sandipan/test-malloc) 7fffa6df36a0 generic_start_main.isra.0+0x140 (/usr/lib64/libc-2.26.so) 7fffa6df3898 __libc_start_main+0xb8 (/usr/lib64/libc-2.26.so) 0 [unknown] ([unknown]) Signed-off-by: Sandipan Das Cc: Jiri Olsa Cc: Maynard Johnson Cc: Naveen N. 
Rao Cc: Ravi Bangoria Cc: Sukadev Bhattiprolu Fixes: a60335ba3298 ("perf tools powerpc: Adjust callchain based on DWARF debug info") Link: http://lkml.kernel.org/r/24bb726d91ed173aebc972ec3f41a2ef2249434e.1530724939.git.sandipan@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/perf/arch/powerpc/util/skip-callchain-idx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index 9d1f6e976a5a..9a53f6e9ef43 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -250,7 +250,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain) if (!chain || chain->nr < 3) return skip_slot; - ip = chain->ips[2]; + ip = chain->ips[1]; thread__find_addr_location(thread, PERF_RECORD_MISC_USER, MAP__FUNCTION, ip, &al); From 8e8c3ba5add95a2f841b149162b928b138412f36 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 1 May 2018 00:55:44 +1000 Subject: [PATCH 1051/1212] powerpc/powernv: opal_put_chars partial write fix [ Upstream commit bd90284cc6c1c9e8e48c8eadd0c79574fcce0b81 ] The intention here is to consume and discard the remaining buffer upon error. This works if there has not been a previous partial write. If there has been, then total_len is no longer total number of bytes to copy. total_len is always "bytes left to copy", so it should be added to written bytes. This code may not be exercised any more if partial writes will not be hit, but this is a small bugfix before a larger change. 
Reviewed-by: Benjamin Herrenschmidt Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/opal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index e48826aa314c..b40606051efe 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -371,7 +371,7 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) /* Closed or other error drop */ if (rc != OPAL_SUCCESS && rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT) { - written = total_len; + written += total_len; break; } if (rc == OPAL_SUCCESS) { From 2ca7b66d826dcf33fec1fff6567f07c460a42955 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 8 Jul 2018 17:07:12 +0200 Subject: [PATCH 1052/1212] MIPS: jz4740: Bump zload address [ Upstream commit c6ea7e9747318e5a6774995f4f8e3e0f7c0fa8ba ] Having the zload address at 0x8060.0000 means the size of the uncompressed kernel cannot be bigger than around 6 MiB, as it is deflated at address 0x8001.0000. This limit is too small; a kernel with some built-in drivers and things like debugfs enabled will already be over 6 MiB in size, and so will fail to extract properly. To fix this, we bump the zload address from 0x8060.0000 to 0x8100.0000. This is fine, as all the boards featuring Ingenic JZ SoCs have at least 32 MiB of RAM, and use u-boot or compatible bootloaders which won't hardcode the load address but read it from the uImage's header. 
Signed-off-by: Paul Cercueil Signed-off-by: Paul Burton Patchwork: https://patchwork.linux-mips.org/patch/19787/ Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/jz4740/Platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/jz4740/Platform b/arch/mips/jz4740/Platform index 28448d358c10..a2a5a85ea1f9 100644 --- a/arch/mips/jz4740/Platform +++ b/arch/mips/jz4740/Platform @@ -1,4 +1,4 @@ platform-$(CONFIG_MACH_INGENIC) += jz4740/ cflags-$(CONFIG_MACH_INGENIC) += -I$(srctree)/arch/mips/include/asm/mach-jz4740 load-$(CONFIG_MACH_INGENIC) += 0xffffffff80010000 -zload-$(CONFIG_MACH_INGENIC) += 0xffffffff80600000 +zload-$(CONFIG_MACH_INGENIC) += 0xffffffff81000000 From 773320786c568693fe809c43cb46851d6f723419 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Tue, 10 Jul 2018 16:48:27 +0530 Subject: [PATCH 1053/1212] mac80211: restrict delayed tailroom needed decrement [ Upstream commit 133bf90dbb8b873286f8ec2e81ba26e863114b8c ] As explained in ieee80211_delayed_tailroom_dec(), during roam, keys of the old AP will be destroyed and new keys will be installed. Deletion of the old key causes crypto_tx_tailroom_needed_cnt to go from 1 to 0 and the new key installation causes a transition from 0 to 1. Whenever crypto_tx_tailroom_needed_cnt transitions from 0 to 1, we invoke synchronize_net(); the reason for doing this is to avoid a race in the TX path as explained in increment_tailroom_need_count(). This synchronize_net() operation can be slow and can affect the station roam time. To avoid this, decrementing the crypto_tx_tailroom_needed_cnt is delayed for a while so that upon installation of new key the transition would be from 1 to 2 instead of 0 to 1 and thereby improving the roam time. This is all correct for a STA iftype, but deferring the tailroom_needed decrement for other iftypes may be unnecessary. 
For example, let's consider the case of a 4-addr client connecting to an AP for which AP_VLAN interface is also created, let the initial value for tailroom_needed on the AP be 1. * 4-addr client connects to the AP (AP: tailroom_needed = 1) * AP will clear old keys, delay decrement of tailroom_needed count * AP_VLAN is created, it takes the tailroom count from master (AP_VLAN: tailroom_needed = 1, AP: tailroom_needed = 1) * Install new key for the station, assume key is plumbed in the HW, there won't be any change in tailroom_needed count on AP iface * Delayed decrement of tailroom_needed count on AP (AP: tailroom_needed = 0, AP_VLAN: tailroom_needed = 1) Because of the delayed decrement on AP iface, tailroom_needed count goes out of sync between AP(master iface) and AP_VLAN(slave iface) and there would be unnecessary tailroom created for the packets going through AP_VLAN iface. Also, WARN_ONs were observed while trying to bring down the AP_VLAN interface: (warn_slowpath_common) (warn_slowpath_null+0x18/0x20) (warn_slowpath_null) (ieee80211_free_keys+0x114/0x1e4) (ieee80211_free_keys) (ieee80211_del_virtual_monitor+0x51c/0x850) (ieee80211_del_virtual_monitor) (ieee80211_stop+0x30/0x3c) (ieee80211_stop) (__dev_close_many+0x94/0xb8) (__dev_close_many) (dev_close_many+0x5c/0xc8) Restricting delayed decrement to station interface alone fixes the problem and it makes sense to do so because delayed decrement is done to improve roam time which is applicable only for client devices. 
Signed-off-by: Manikanta Pubbisetty Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/mac80211/cfg.c | 2 +- net/mac80211/key.c | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 00a8cc572a22..1f930032253a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -286,7 +286,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, goto out_unlock; } - ieee80211_key_free(key, true); + ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION); ret = 0; out_unlock: diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 4a72c0d1e56f..91a4e606edcd 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -647,11 +647,15 @@ int ieee80211_key_link(struct ieee80211_key *key, { struct ieee80211_local *local = sdata->local; struct ieee80211_key *old_key; - int idx, ret; - bool pairwise; - - pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; - idx = key->conf.keyidx; + int idx = key->conf.keyidx; + bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; + /* + * We want to delay tailroom updates only for station - in that + * case it helps roaming speed, but in other cases it hurts and + * can cause warnings to appear. 
+ */ + bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; + int ret; mutex_lock(&sdata->local->key_mtx); @@ -679,14 +683,14 @@ int ieee80211_key_link(struct ieee80211_key *key, increment_tailroom_need_count(sdata); ieee80211_key_replace(sdata, sta, pairwise, old_key, key); - ieee80211_key_destroy(old_key, true); + ieee80211_key_destroy(old_key, delay_tailroom); ieee80211_debugfs_key_add(key); if (!local->wowlan) { ret = ieee80211_key_enable_hw_accel(key); if (ret) - ieee80211_key_free(key, true); + ieee80211_key_free(key, delay_tailroom); } else { ret = 0; } @@ -874,7 +878,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - __ieee80211_key_destroy(key, true); + __ieee80211_key_destroy(key, key->sdata->vif.type == + NL80211_IFTYPE_STATION); } for (i = 0; i < NUM_DEFAULT_KEYS; i++) { @@ -884,7 +889,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, ieee80211_key_replace(key->sdata, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); - __ieee80211_key_destroy(key, true); + __ieee80211_key_destroy(key, key->sdata->vif.type == + NL80211_IFTYPE_STATION); } mutex_unlock(&local->key_mtx); From 2e0c018c0c60ab5053cccfa2f3f5f6641d703754 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 20 Jul 2018 18:33:59 +0200 Subject: [PATCH 1054/1212] xen-netfront: fix queue name setting [ Upstream commit 2d408c0d4574b01b9ed45e02516888bf925e11a9 ] Commit f599c64fdf7d ("xen-netfront: Fix race between device setup and open") changed the initialization order: xennet_create_queues() now happens before we do register_netdev() so using netdev->name in xennet_init_queue() is incorrect, we end up with the following in /proc/interrupts: 60: 139 0 xen-dyn -event eth%d-q0-tx 61: 265 0 xen-dyn -event eth%d-q0-rx 62: 234 0 xen-dyn -event eth%d-q1-tx 63: 1 0 xen-dyn -event eth%d-q1-rx and this looks ugly. 
Actually, using early netdev name (even when it's already set) is also not ideal: nowadays we tend to rename eth devices and queue name may end up not corresponding to the netdev name. Use nodename from xenbus device for queue naming: this can't change in VM's lifetime. Now /proc/interrupts looks like 62: 202 0 xen-dyn -event device/vif/0-q0-tx 63: 317 0 xen-dyn -event device/vif/0-q0-rx 64: 262 0 xen-dyn -event device/vif/0-q1-tx 65: 17 0 xen-dyn -event device/vif/0-q1-rx Fixes: f599c64fdf7d ("xen-netfront: Fix race between device setup and open") Signed-off-by: Vitaly Kuznetsov Reviewed-by: Ross Lagerwall Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 68d0a5c9d437..2c0db518fe14 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1616,7 +1616,7 @@ static int xennet_init_queue(struct netfront_queue *queue) (unsigned long)queue); snprintf(queue->name, sizeof(queue->name), "%s-q%u", - queue->info->netdev->name, queue->id); + queue->info->xbdev->nodename, queue->id); /* Initialise tx_skbs as a free chain containing every entry. */ queue->tx_skb_freelist = 0; From 4f3381ab8af23acc958828795477c7f8929e8925 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Wed, 11 Jul 2018 14:18:23 +0200 Subject: [PATCH 1055/1212] arm64: dts: qcom: db410c: Fix Bluetooth LED trigger [ Upstream commit e53db018315b7660bb7000a29e79faff2496c2c2 ] Current LED trigger, 'bt', is not known/used by any existing driver. Fix this by renaming it to 'bluetooth-power' trigger which is controlled by the Bluetooth subsystem. 
Fixes: 9943230c8860 ("arm64: dts: qcom: Add apq8016-sbc board LED's related device nodes") Signed-off-by: Loic Poulain Signed-off-by: Andy Gross Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi b/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi index 6b8abbe68746..3011c88bd2f3 100644 --- a/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi +++ b/arch/arm64/boot/dts/qcom/apq8016-sbc.dtsi @@ -105,7 +105,7 @@ led@5 { led@6 { label = "apq8016-sbc:blue:bt"; gpios = <&pm8916_mpps 3 GPIO_ACTIVE_HIGH>; - linux,default-trigger = "bt"; + linux,default-trigger = "bluetooth-power"; default-state = "off"; }; }; From 29162495129d39080d21069364f3eb9919509a9a Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Jul 2018 12:43:48 +0200 Subject: [PATCH 1056/1212] s390/qeth: fix race in used-buffer accounting [ Upstream commit a702349a4099cd5a7bab0904689d8e0bf8dcd622 ] By updating q->used_buffers only _after_ do_QDIO() has completed, there is a potential race against the buffer's TX completion. In the unlikely case that the TX completion path wins, qeth_qdio_output_handler() would decrement the counter before qeth_flush_buffers() even incremented it. Signed-off-by: Julian Wiedmann Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_core_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 95c631125a20..09ac56317f1b 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3505,13 +3505,14 @@ static void qeth_flush_buffers(struct qeth_qdio_out_q *queue, int index, qdio_flags = QDIO_FLAG_SYNC_OUTPUT; if (atomic_read(&queue->set_pci_flags_count)) qdio_flags |= QDIO_FLAG_PCI_OUT; + atomic_add(count, &queue->used_buffers); + rc = do_QDIO(CARD_DDEV(queue->card), qdio_flags, queue->queue_no, index, count); if (queue->card->options.performance_stats) queue->card->perf_stats.outbound_do_qdio_time += qeth_get_micros() - queue->card->perf_stats.outbound_do_qdio_start_time; - atomic_add(count, &queue->used_buffers); if (rc) { queue->card->stats.tx_errors += count; /* ignore temporary SIGA errors without busy condition */ From a4b8132c86c9c5a94e151cd9832ef17aef625963 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 19 Jul 2018 12:43:49 +0200 Subject: [PATCH 1057/1212] s390/qeth: reset layer2 attribute on layer switch [ Upstream commit 70551dc46ffa3555a0b5f3545b0cd87ab67fd002 ] After the subdriver's remove() routine has completed, the card's layer mode is undetermined again. Reflect this in the layer2 field. If qeth_dev_layer2_store() hits an error after remove() was called, the card _always_ requires a setup(), even if the previous layer mode is requested again. But qeth_dev_layer2_store() bails out early if the requested layer mode still matches the current one. So unless we reset the layer2 field, re-probing the card back to its previous mode is currently not possible. Signed-off-by: Julian Wiedmann Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/s390/net/qeth_core_sys.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index fa844b0ff847..7bcf0dae3a65 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -419,6 +419,7 @@ static ssize_t qeth_dev_layer2_store(struct device *dev, if (card->discipline) { card->discipline->remove(card->gdev); qeth_core_free_discipline(card); + card->options.layer2 = -1; } rc = qeth_core_load_discipline(card, newdis); From da189ebd788cf139303868fe51257b3d25ffbf62 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Jul 2018 20:53:09 -0700 Subject: [PATCH 1058/1212] platform/x86: toshiba_acpi: Fix defined but not used build warnings [ Upstream commit c2e2a618eb7104e18fdcf739d4d911563812a81c ] Fix a build warning in toshiba_acpi.c when CONFIG_PROC_FS is not enabled by marking the unused function as __maybe_unused. ../drivers/platform/x86/toshiba_acpi.c:1685:12: warning: 'version_proc_show' defined but not used [-Wunused-function] Signed-off-by: Randy Dunlap Cc: Azael Avalos Cc: platform-driver-x86@vger.kernel.org Cc: Andy Shevchenko Signed-off-by: Darren Hart (VMware) Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/toshiba_acpi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index f774cb576ffa..1ff95b5a429d 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -34,6 +34,7 @@ #define TOSHIBA_ACPI_VERSION "0.23" #define PROC_INTERFACE_VERSION 1 +#include #include #include #include @@ -1472,7 +1473,7 @@ static const struct file_operations keys_proc_fops = { .write = keys_proc_write, }; -static int version_proc_show(struct seq_file *m, void *v) +static int __maybe_unused version_proc_show(struct seq_file *m, void *v) { 
seq_printf(m, "driver: %s\n", TOSHIBA_ACPI_VERSION); seq_printf(m, "proc_interface: %d\n", PROC_INTERFACE_VERSION); From 7150120104483994c464e55c985afa930f5e2a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20M=C3=BCller?= Date: Sun, 15 Jul 2018 00:27:06 +0200 Subject: [PATCH 1059/1212] crypto: sharah - Unregister correct algorithms for SAHARA 3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0e7d4d932ffc23f75efb31a8c2ac2396c1b81c55 ] This patch fixes two typos related to unregistering algorithms supported by SAHARA 3. In sahara_register_algs the wrong algorithms are unregistered in case of an error. In sahara_unregister_algs the wrong array is used to determine the iteration count. Signed-off-by: Michael Müller Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/sahara.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c index f68c24a98277..dedfc96acc66 100644 --- a/drivers/crypto/sahara.c +++ b/drivers/crypto/sahara.c @@ -1363,7 +1363,7 @@ static int sahara_register_algs(struct sahara_dev *dev) err_sha_v3_algs: for (j = 0; j < k; j++) - crypto_unregister_ahash(&sha_v4_algs[j]); + crypto_unregister_ahash(&sha_v3_algs[j]); err_aes_algs: for (j = 0; j < i; j++) @@ -1379,7 +1379,7 @@ static void sahara_unregister_algs(struct sahara_dev *dev) for (i = 0; i < ARRAY_SIZE(aes_algs); i++) crypto_unregister_alg(&aes_algs[i]); - for (i = 0; i < ARRAY_SIZE(sha_v4_algs); i++) + for (i = 0; i < ARRAY_SIZE(sha_v3_algs); i++) crypto_unregister_ahash(&sha_v3_algs[i]); if (dev->version > SAHARA_VERSION_3) From 12c3ba18e3f11cb573589d14c2d82646b1981d29 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Tue, 14 Aug 2018 23:21:28 +0800 Subject: [PATCH 1060/1212] xen-netfront: fix warn message as irq device name has '/' [ Upstream commit 21f2706b20100bb3db378461ab9b8e2035309b5b ] There is a call trace 
generated after commit 2d408c0d4574b01b9ed45e02516888bf925e11a9 ("xen-netfront: fix queue name setting"). There is no 'device/vif/xx-q0-tx' file found under /proc/irq/xx/. This patch only picks up device type and id as its name. With the patch, now /proc/interrupts looks like below and the warning message is gone: 70: 21 0 0 0 xen-dyn -event vif0-q0-tx 71: 15 0 0 0 xen-dyn -event vif0-q0-rx 72: 14 0 0 0 xen-dyn -event vif0-q1-tx 73: 33 0 0 0 xen-dyn -event vif0-q1-rx 74: 12 0 0 0 xen-dyn -event vif0-q2-tx 75: 24 0 0 0 xen-dyn -event vif0-q2-rx 76: 19 0 0 0 xen-dyn -event vif0-q3-tx 77: 21 0 0 0 xen-dyn -event vif0-q3-rx Below is call trace information without this patch: name 'device/vif/0-q0-tx' WARNING: CPU: 2 PID: 37 at fs/proc/generic.c:174 __xlate_proc_name+0x85/0xa0 RIP: 0010:__xlate_proc_name+0x85/0xa0 RSP: 0018:ffffb85c40473c18 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000006 RCX: 0000000000000006 RDX: 0000000000000007 RSI: 0000000000000096 RDI: ffff984c7f516930 RBP: ffffb85c40473cb8 R08: 000000000000002c R09: 0000000000000229 R10: 0000000000000000 R11: 0000000000000001 R12: ffffb85c40473c98 R13: ffffb85c40473cb8 R14: ffffb85c40473c50 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff984c7f500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f69b6899038 CR3: 000000001c20a006 CR4: 00000000001606e0 Call Trace: __proc_create+0x45/0x230 ? snprintf+0x49/0x60 proc_mkdir_data+0x35/0x90 register_handler_proc+0xef/0x110 ? proc_register+0xfc/0x110 ? proc_create_data+0x70/0xb0 __setup_irq+0x39b/0x660 ? request_threaded_irq+0xad/0x160 request_threaded_irq+0xf5/0x160 ? xennet_tx_buf_gc+0x1d0/0x1d0 [xen_netfront] bind_evtchn_to_irqhandler+0x3d/0x70 ? xenbus_alloc_evtchn+0x41/0xa0 netback_changed+0xa46/0xcda [xen_netfront] ? find_watch+0x40/0x40 xenwatch_thread+0xc5/0x160 ? finish_wait+0x80/0x80 kthread+0x112/0x130 ? 
kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x35/0x40 Code: 81 5c 00 48 85 c0 75 cc 5b 49 89 2e 31 c0 5d 4d 89 3c 24 41 5c 41 5d 41 5e 41 5f c3 4c 89 ee 48 c7 c7 40 4f 0e b4 e8 65 ea d8 ff <0f> 0b b8 fe ff ff ff 5b 5d 41 5c 41 5d 41 5e 41 5f c3 66 0f 1f ---[ end trace 650e5561b0caab3a ]--- Signed-off-by: Xiao Liang Reviewed-by: Juergen Gross Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 2c0db518fe14..c48665eae9ee 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1608,6 +1608,7 @@ static int xennet_init_queue(struct netfront_queue *queue) { unsigned short i; int err = 0; + char *devid; spin_lock_init(&queue->tx_lock); spin_lock_init(&queue->rx_lock); @@ -1615,8 +1616,9 @@ static int xennet_init_queue(struct netfront_queue *queue) setup_timer(&queue->rx_refill_timer, rx_refill_timeout, (unsigned long)queue); - snprintf(queue->name, sizeof(queue->name), "%s-q%u", - queue->info->xbdev->nodename, queue->id); + devid = strrchr(queue->info->xbdev->nodename, '/') + 1; + snprintf(queue->name, sizeof(queue->name), "vif%s-q%u", + devid, queue->id); /* Initialise tx_skbs as a free chain containing every entry. */ queue->tx_skb_freelist = 0; From 07c63fd028418528d9f58f8f8fab33ad20432ce2 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 30 Aug 2018 08:35:19 +0300 Subject: [PATCH 1061/1212] RDMA/cma: Protect cma dev list with lock commit 954a8e3aea87e896e320cf648c1a5bbe47de443e upstream. When AF_IB addresses are used during rdma_resolve_addr() a lock is not held. A cma device can get removed while list traversal is in progress which may lead to crash. 
ie CPU0 CPU1 ==== ==== rdma_resolve_addr() cma_resolve_ib_dev() list_for_each() cma_remove_one() cur_dev->device mutex_lock(&lock) list_del(); mutex_unlock(&lock); cma_process_remove(); Therefore, hold a lock while traversing the list which avoids such situation. Cc: # 3.10 Fixes: f17df3b0dede ("RDMA/cma: Add support for AF_IB to rdma_resolve_addr()") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/cma.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0f42411d6a79..1454290078de 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -544,6 +544,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); + mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { if (!rdma_cap_af_ib(cur_dev->device, p)) @@ -567,18 +568,19 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; + goto found; } } } } - - if (!cma_dev) - return -ENODEV; + mutex_unlock(&lock); + return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); - addr = (struct sockaddr_ib *) cma_src_addr(id_priv); - memcpy(&addr->sib_addr, &sgid, sizeof sgid); + mutex_unlock(&lock); + addr = (struct sockaddr_ib *)cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } From 1cd01dba1355e291f636555b1920625ed58c1623 Mon Sep 17 00:00:00 2001 From: Bin Yang Date: Wed, 12 Sep 2018 03:36:34 +0000 Subject: [PATCH 1062/1212] pstore: Fix incorrect persistent ram buffer mapping commit 831b624df1b420c8f9281ed1307a8db23afb72df upstream. 
persistent_ram_vmap() returns the page start vaddr. persistent_ram_iomap() supports non-page-aligned mapping. persistent_ram_buffer_map() always adds offset-in-page to the vaddr returned from these two functions, which causes incorrect mapping of non-page-aligned persistent ram buffer. By default ftrace_size is 4096 and max_ftrace_cnt is nr_cpu_ids. Without this patch, the zone_sz in ramoops_init_przs() is 4096/nr_cpu_ids which might not be page aligned. If the offset-in-page > 2048, the vaddr will be in next page. If the next page is not mapped, it will cause kernel panic: [ 0.074231] BUG: unable to handle kernel paging request at ffffa19e0081b000 ... [ 0.075000] RIP: 0010:persistent_ram_new+0x1f8/0x39f ... [ 0.075000] Call Trace: [ 0.075000] ramoops_init_przs.part.10.constprop.15+0x105/0x260 [ 0.075000] ramoops_probe+0x232/0x3a0 [ 0.075000] platform_drv_probe+0x3e/0xa0 [ 0.075000] driver_probe_device+0x2cd/0x400 [ 0.075000] __driver_attach+0xe4/0x110 [ 0.075000] ? driver_probe_device+0x400/0x400 [ 0.075000] bus_for_each_dev+0x70/0xa0 [ 0.075000] driver_attach+0x1e/0x20 [ 0.075000] bus_add_driver+0x159/0x230 [ 0.075000] ? do_early_param+0x95/0x95 [ 0.075000] driver_register+0x70/0xc0 [ 0.075000] ? init_pstore_fs+0x4d/0x4d [ 0.075000] __platform_driver_register+0x36/0x40 [ 0.075000] ramoops_init+0x12f/0x131 [ 0.075000] do_one_initcall+0x4d/0x12c [ 0.075000] ? do_early_param+0x95/0x95 [ 0.075000] kernel_init_freeable+0x19b/0x222 [ 0.075000] ? 
rest_init+0xbb/0xbb [ 0.075000] kernel_init+0xe/0xfc [ 0.075000] ret_from_fork+0x3a/0x50 Signed-off-by: Bin Yang [kees: add comments describing the mapping differences, updated commit log] Fixes: 24c3d2f342ed ("staging: android: persistent_ram: Make it possible to use memory outside of bootmem") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram_core.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 27300533c2dd..bd21795ce657 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -378,7 +378,12 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, vaddr = vmap(pages, page_count, VM_MAP, prot); kfree(pages); - return vaddr; + /* + * Since vmap() uses page granularity, we must add the offset + * into the page here, to get the byte granularity address + * into the mapping to represent the actual "start" location. + */ + return vaddr + offset_in_page(start); } static void *persistent_ram_iomap(phys_addr_t start, size_t size, @@ -397,6 +402,11 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size, else va = ioremap_wc(start, size); + /* + * Since request_mem_region() and ioremap() are byte-granularity + * there is no need handle anything special like we do when the + * vmap() case in persistent_ram_vmap() above. + */ return va; } @@ -417,7 +427,7 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, return -ENOMEM; } - prz->buffer = prz->vaddr + offset_in_page(start); + prz->buffer = prz->vaddr; prz->buffer_size = size - sizeof(struct persistent_ram_buffer); return 0; @@ -464,7 +474,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) if (prz->vaddr) { if (pfn_valid(prz->paddr >> PAGE_SHIFT)) { - vunmap(prz->vaddr); + /* We must vunmap() at page-granularity. 
*/ + vunmap(prz->vaddr - offset_in_page(prz->paddr)); } else { iounmap(prz->vaddr); release_mem_region(prz->paddr, prz->size); From a739cb3e04a2f87bf29119a61cb2f77483ae228b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 7 Sep 2018 14:21:30 +0200 Subject: [PATCH 1063/1212] xen/netfront: fix waiting for xenbus state change commit 8edfe2e992b75aee3da9316e9697c531194c2f53 upstream. Commit 822fb18a82aba ("xen-netfront: wait xenbus state change when load module manually") added a new wait queue to wait on for a state change when the module is loaded manually. Unfortunately there is no wakeup anywhere to stop that waiting. Instead of introducing a new wait queue rename the existing module_unload_q to module_wq and use it for both purposes (loading and unloading). As any state change of the backend might be intended to stop waiting do the wake_up_all() in any case when netback_changed() is called. Fixes: 822fb18a82aba ("xen-netfront: wait xenbus state change when load module manually") Cc: #4.18 Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index c48665eae9ee..3270b4333668 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -86,8 +86,7 @@ struct netfront_cb { /* IRQ name is queue name with "-tx" or "-rx" appended */ #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3) -static DECLARE_WAIT_QUEUE_HEAD(module_load_q); -static DECLARE_WAIT_QUEUE_HEAD(module_unload_q); +static DECLARE_WAIT_QUEUE_HEAD(module_wq); struct netfront_stats { u64 packets; @@ -1336,11 +1335,11 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) netif_carrier_off(netdev); xenbus_switch_state(dev, XenbusStateInitialising); - wait_event(module_load_q, - xenbus_read_driver_state(dev->otherend) != - XenbusStateClosed && - xenbus_read_driver_state(dev->otherend) != - XenbusStateUnknown); + wait_event(module_wq, + xenbus_read_driver_state(dev->otherend) != + XenbusStateClosed && + xenbus_read_driver_state(dev->otherend) != + XenbusStateUnknown); return netdev; exit: @@ -2025,15 +2024,14 @@ static void netback_changed(struct xenbus_device *dev, dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state)); + wake_up_all(&module_wq); + switch (backend_state) { case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: case XenbusStateReconfigured: - break; - case XenbusStateUnknown: - wake_up_all(&module_unload_q); break; case XenbusStateInitWait: @@ -2049,12 +2047,10 @@ static void netback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - wake_up_all(&module_unload_q); if (dev->state == XenbusStateClosed) break; /* Missed the backend's CLOSING state -- fallthrough */ case XenbusStateClosing: - wake_up_all(&module_unload_q); xenbus_frontend_closed(dev); break; } @@ -2162,14 +2158,14 @@ static int xennet_remove(struct xenbus_device *dev) if 
(xenbus_read_driver_state(dev->otherend) != XenbusStateClosed) { xenbus_switch_state(dev, XenbusStateClosing); - wait_event(module_unload_q, + wait_event(module_wq, xenbus_read_driver_state(dev->otherend) == XenbusStateClosing || xenbus_read_driver_state(dev->otherend) == XenbusStateUnknown); xenbus_switch_state(dev, XenbusStateClosed); - wait_event(module_unload_q, + wait_event(module_wq, xenbus_read_driver_state(dev->otherend) == XenbusStateClosed || xenbus_read_driver_state(dev->otherend) == From ee4d1a30a4dd851aa31488ac77bfdd9ae6ae2556 Mon Sep 17 00:00:00 2001 From: Aaron Knister Date: Fri, 24 Aug 2018 08:42:46 -0400 Subject: [PATCH 1064/1212] IB/ipoib: Avoid a race condition between start_xmit and cm_rep_handler commit 816e846c2eb9129a3e0afa5f920c8bbc71efecaa upstream. Inside of start_xmit() the call to check if the connection is up and the queueing of the packets for later transmission is not atomic which leaves a window where cm_rep_handler can run, set the connection up, dequeue pending packets and leave the subsequently queued packets by start_xmit() sitting on neigh->queue until they're dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections. Here's the code in start_xmit where we check to see if the connection is up: if (ipoib_cm_get(neigh)) { if (ipoib_cm_up(neigh)) { ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); goto unref; } } The race occurs if cm_rep_handler execution occurs after the above connection check (specifically if it gets to the point where it acquires priv->lock to dequeue pending skb's) but before the below code snippet in start_xmit where packets are queued. 
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); spin_lock_irqsave(&priv->lock, flags); __skb_queue_tail(&neigh->queue, skb); spin_unlock_irqrestore(&priv->lock, flags); } else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } The patch acquires the netif tx lock in cm_rep_handler for the section where it sets the connection up and dequeues and retransmits deferred skb's. Fixes: 839fcaba355a ("IPoIB: Connected mode experimental support") Cc: stable@vger.kernel.org Signed-off-by: Aaron Knister Tested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index f74b11542603..a338e60836ee 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -992,12 +992,14 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even skb_queue_head_init(&skqueue); + netif_tx_lock_bh(p->dev); spin_lock_irq(&priv->lock); set_bit(IPOIB_FLAG_OPER_UP, &p->flags); if (p->neigh) while ((skb = __skb_dequeue(&p->neigh->queue))) __skb_queue_tail(&skqueue, skb); spin_unlock_irq(&priv->lock); + netif_tx_unlock_bh(p->dev); while ((skb = __skb_dequeue(&skqueue))) { skb->dev = p->dev; From c527796fdb64c982d80b0b1f63bd53d7e0c594d5 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 10 Aug 2018 23:06:07 +0000 Subject: [PATCH 1065/1212] Tools: hv: Fix a bug in the key delete code commit 86503bd35dec0ce363e9fdbf5299927422ed3899 upstream. Fix a bug in the key delete code - the num_records range from 0 to num_records-1. Signed-off-by: K. Y. 
Srinivasan Reported-by: David Binderman Cc: Reviewed-by: Michael Kelley Signed-off-by: Greg Kroah-Hartman --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 60a94b3e532e..177480066816 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -286,7 +286,7 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) * Found a match; just move the remaining * entries up. */ - if (i == num_records) { + if (i == (num_records - 1)) { kvp_file_info[pool].num_records--; kvp_update_file(pool); return 0; From 68fe884e7b98351684849a78d2bb099a7b389b64 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 15 Aug 2018 10:50:41 -0500 Subject: [PATCH 1066/1212] misc: hmc6352: fix potential Spectre v1 commit de916736aaaadddbd6061472969f667b14204aa9 upstream. val is indirectly controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: drivers/misc/hmc6352.c:54 compass_store() warn: potential spectre issue 'map' [r] Fix this by sanitizing val before using it to index map Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2 Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Greg Kroah-Hartman --- drivers/misc/hmc6352.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/hmc6352.c b/drivers/misc/hmc6352.c index 90520d76633f..9cde4c5bfba4 100644 --- a/drivers/misc/hmc6352.c +++ b/drivers/misc/hmc6352.c @@ -27,6 +27,7 @@ #include #include #include +#include static DEFINE_MUTEX(compass_mutex); @@ -50,6 +51,7 @@ static int compass_store(struct device *dev, const char *buf, size_t count, return ret; if (val >= strlen(map)) return -EINVAL; + val = array_index_nospec(val, strlen(map)); mutex_lock(&compass_mutex); ret = compass_command(c, map[val]); mutex_unlock(&compass_mutex); From c343fc8035cb31357bd7d575590bb51c4e16563c Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 4 Sep 2018 17:35:16 +0300 Subject: [PATCH 1067/1212] usb: Don't die twice if PCI xhci host is not responding in resume commit f3dc41c5d22b2ca14a0802a65d8cdc33a3882d4e upstream. usb_hc_died() should only be called once, and with the primary HCD as parameter. It will mark both primary and secondary hcd's dead. Remove the extra call to usb_hc_died() with the shared hcd as parameter.
Fixes: ff9d78b36f76 ("USB: Set usb_hcd->state and flags for shared roothubs") Signed-off-by: Mathias Nyman Cc: stable Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd-pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index 40378487e023..a5e3e410db4e 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -529,8 +529,6 @@ static int resume_common(struct device *dev, int event) event == PM_EVENT_RESTORE); if (retval) { dev_err(dev, "PCI post-resume error %d!\n", retval); - if (hcd->shared_hcd) - usb_hc_died(hcd->shared_hcd); usb_hc_died(hcd); } } From 49c05a0000dd72be965287f00650a81d67770063 Mon Sep 17 00:00:00 2001 From: Tim Anderson Date: Thu, 9 Aug 2018 14:55:34 -0700 Subject: [PATCH 1068/1212] USB: Add quirk to support DJI CineSSD commit f45681f9becaa65111ed0a691ccf080a0cd5feb8 upstream. This device does not correctly handle the LPM operations. Also, the device cannot handle ATA pass-through commands and locks up when attempted while running in super speed. This patch adds the equivalent quirk logic as found in uas. 
Signed-off-by: Tim Anderson Acked-by: Alan Stern Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ drivers/usb/storage/scsiglue.c | 9 +++++++++ drivers/usb/storage/unusual_devs.h | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 99f67764765f..deab9935c1af 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -259,6 +259,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, + /* DJI CineSSD */ + { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM }, + /* INTEL VALUE SSD */ { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME }, diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index dba51362d2e2..6c186b4df94a 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -341,6 +341,15 @@ static int queuecommand_lck(struct scsi_cmnd *srb, return 0; } + if ((us->fflags & US_FL_NO_ATA_1X) && + (srb->cmnd[0] == ATA_12 || srb->cmnd[0] == ATA_16)) { + memcpy(srb->sense_buffer, usb_stor_sense_invalidCDB, + sizeof(usb_stor_sense_invalidCDB)); + srb->result = SAM_STAT_CHECK_CONDITION; + done(srb); + return 0; + } + /* enqueue the command and wake up the control thread */ srb->scsi_done = done; us->srb = srb; diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 1a34d2a89de6..898215cad351 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -2213,6 +2213,13 @@ UNUSUAL_DEV( 0x4146, 0xba01, 0x0100, 0x0100, "Micro Mini 1GB", USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_NOT_LOCKABLE ), +/* Reported-by: Tim Anderson */ +UNUSUAL_DEV( 0x2ca3, 0x0031, 0x0000, 0x9999, + "DJI", + "CineSSD", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_ATA_1X), + /* * Nick Bowler * SCSI stack spams (otherwise harmless) error messages. 
From 35bcdf481ce4811fd326ac64f547e372f49aca3d Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 3 Sep 2018 15:44:16 +0300 Subject: [PATCH 1069/1212] usb: Avoid use-after-free by flushing endpoints early in usb_set_interface() commit f9a5b4f58b280c1d26255376713c132f93837621 upstream. The steps taken by usb core to set a new interface is very different from what is done on the xHC host side. xHC hardware will do everything in one go. One command is used to set up new endpoints, free old endpoints, check bandwidth, and run the new endpoints. All this is done by xHC when usb core asks the hcd to check for available bandwidth. At this point usb core has not yet flushed the old endpoints, which will cause use-after-free issues in xhci driver as queued URBs are cancelled on a re-allocated endpoint. To resolve this add a call to usb_disable_interface() which will flush the endpoints before calling usb_hcd_alloc_bandwidth() Additional checks in xhci driver will also be implemented to gracefully handle stale URB cancel on freed and re-allocated endpoints Cc: Reported-by: Sudip Mukherjee Signed-off-by: Mathias Nyman Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 29adabdb305f..08cba309eb78 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -1282,6 +1282,11 @@ void usb_enable_interface(struct usb_device *dev, * is submitted that needs that bandwidth. Some other operating systems * allocate bandwidth early, when a configuration is chosen. * + * xHCI reserves bandwidth and configures the alternate setting in + * usb_hcd_alloc_bandwidth(). If it fails the original interface altsetting + * may be disabled. Drivers cannot rely on any particular alternate + * setting being in effect after a failure. + * * This call is synchronous, and may not be used in an interrupt context. 
* Also, drivers must not change altsettings while urbs are scheduled for * endpoints in that interface; all such urbs must first be completed @@ -1317,6 +1322,12 @@ int usb_set_interface(struct usb_device *dev, int interface, int alternate) alternate); return -EINVAL; } + /* + * usb3 hosts configure the interface in usb_hcd_alloc_bandwidth, + * including freeing dropped endpoint ring buffers. + * Make sure the interface endpoints are flushed before that + */ + usb_disable_interface(dev, iface, false); /* Make sure we have enough bandwidth for this alternate interface. * Remove the current alt setting and add the new alt setting. From 637acc7bdebc53500f13d7da1b37825e21b2e588 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 1 Sep 2018 17:23:47 +0800 Subject: [PATCH 1070/1212] usb: host: u132-hcd: Fix a sleep-in-atomic-context bug in u132_get_frame() commit 6d4f268fa132742fe96dad22307c68d237356d88 upstream. i_usX2Y_subs_startup in usbusx2yaudio.c is a completion handler function for the USB driver. So it should not sleep, but it can sleep according to the function call paths (from bottom to top) in Linux-4.16. [FUNC] msleep drivers/usb/host/u132-hcd.c, 2558: msleep in u132_get_frame drivers/usb/core/hcd.c, 2231: [FUNC_PTR]u132_get_frame in usb_hcd_get_frame_number drivers/usb/core/usb.c, 822: usb_hcd_get_frame_number in usb_get_current_frame_number sound/usb/usx2y/usbusx2yaudio.c, 303: usb_get_current_frame_number in i_usX2Y_urb_complete sound/usb/usx2y/usbusx2yaudio.c, 366: i_usX2Y_urb_complete in i_usX2Y_subs_startup Note that [FUNC_PTR] means a function pointer call is used. To fix this bug, msleep() is replaced with mdelay(). This bug is found by my static analysis tool DSAC.
Signed-off-by: Jia-Ju Bai Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/u132-hcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index 692ccc69345e..d5434e7a3b2e 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -2565,7 +2565,7 @@ static int u132_get_frame(struct usb_hcd *hcd) } else { int frame = 0; dev_err(&u132->platform_dev->dev, "TODO: u132_get_frame\n"); - msleep(100); + mdelay(100); return frame; } } From 34f1df0ff035c4e14decea8d439b785f4f913bc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxence=20Dupr=C3=A8s?= Date: Wed, 8 Aug 2018 23:56:33 +0000 Subject: [PATCH 1071/1212] USB: add quirk for WORLDE Controller KS49 or Prodipe MIDI 49C USB controller commit 9b83a1c301ad6d24988a128c69b42cbaaf537d82 upstream. WORLDE Controller KS49 or Prodipe MIDI 49C USB controller causes a -EPROTO error, a communication restart and loop again. This issue has already been fixed for KS25. https://lore.kernel.org/patchwork/patch/753077/ I just added device 201 for KS49 in quirks.c to get it to work.
Signed-off-by: Laurent Roux Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index deab9935c1af..37a5e07b3488 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -37,6 +37,10 @@ static const struct usb_device_id usb_quirk_list[] = { /* CBM - Flash disk */ { USB_DEVICE(0x0204, 0x6025), .driver_info = USB_QUIRK_RESET_RESUME }, + /* WORLDE Controller KS49 or Prodipe MIDI 49C USB controller */ + { USB_DEVICE(0x0218, 0x0201), .driver_info = + USB_QUIRK_CONFIG_INTF_STRINGS }, + /* WORLDE easy key (easykey.25) MIDI controller */ { USB_DEVICE(0x0218, 0x0401), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, From f409f34070299081b243aab0405b151cce28f6db Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 8 Aug 2018 11:20:39 -0400 Subject: [PATCH 1072/1212] USB: net2280: Fix erroneous synchronization change commit dec3c23c9aa1815f07d98ae0375b4cbc10971e13 upstream. Commit f16443a034c7 ("USB: gadgetfs, dummy-hcd, net2280: fix locking for callbacks") was based on a serious misunderstanding. It introduced regressions into both the dummy-hcd and net2280 drivers. The problem in dummy-hcd was fixed by commit 7dbd8f4cabd9 ("USB: dummy-hcd: Fix erroneous synchronization change"), but the problem in net2280 remains. Namely: the ->disconnect(), ->suspend(), ->resume(), and ->reset() callbacks must be invoked without the private lock held; otherwise a deadlock will occur when the callback routine tries to interact with the UDC driver. This patch largely is a reversion of the relevant parts of f16443a034c7. It also drops the private lock around the calls to ->suspend() and ->resume() (something the earlier patch forgot to do). This is safe from races with device interrupts because it occurs within the interrupt handler. Finally, the patch changes where the ->disconnect() callback is invoked when net2280_pullup() turns the pullup off. 
Rather than making the callback from within stop_activity() at a time when dropping the private lock could be unsafe, the callback is moved to a point after the lock has already been dropped. Signed-off-by: Alan Stern Fixes: f16443a034c7 ("USB: gadgetfs, dummy-hcd, net2280: fix locking for callbacks") Reported-by: D. Ziesche Tested-by: D. Ziesche CC: Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/net2280.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c index a47de8c31ce9..8efeadf30b4d 100644 --- a/drivers/usb/gadget/udc/net2280.c +++ b/drivers/usb/gadget/udc/net2280.c @@ -1542,11 +1542,14 @@ static int net2280_pullup(struct usb_gadget *_gadget, int is_on) writel(tmp | BIT(USB_DETECT_ENABLE), &dev->usb->usbctl); } else { writel(tmp & ~BIT(USB_DETECT_ENABLE), &dev->usb->usbctl); - stop_activity(dev, dev->driver); + stop_activity(dev, NULL); } spin_unlock_irqrestore(&dev->lock, flags); + if (!is_on && dev->driver) + dev->driver->disconnect(&dev->gadget); + return 0; } @@ -2425,8 +2428,11 @@ static void stop_activity(struct net2280 *dev, struct usb_gadget_driver *driver) nuke(&dev->ep[i]); /* report disconnect; the driver is already quiesced */ - if (driver) + if (driver) { + spin_unlock(&dev->lock); driver->disconnect(&dev->gadget); + spin_lock(&dev->lock); + } usb_reinit(dev); } @@ -3272,6 +3278,8 @@ static void handle_stat0_irqs(struct net2280 *dev, u32 stat) BIT(PCI_RETRY_ABORT_INTERRUPT)) static void handle_stat1_irqs(struct net2280 *dev, u32 stat) +__releases(dev->lock) +__acquires(dev->lock) { struct net2280_ep *ep; u32 tmp, num, mask, scratch; @@ -3312,12 +3320,14 @@ static void handle_stat1_irqs(struct net2280 *dev, u32 stat) if (disconnect || reset) { stop_activity(dev, dev->driver); ep0_start(dev); + spin_unlock(&dev->lock); if (reset) usb_gadget_udc_reset (&dev->gadget, dev->driver); else 
(dev->driver->disconnect) (&dev->gadget); + spin_lock(&dev->lock); return; } } @@ -3336,6 +3346,7 @@ static void handle_stat1_irqs(struct net2280 *dev, u32 stat) tmp = BIT(SUSPEND_REQUEST_CHANGE_INTERRUPT); if (stat & tmp) { writel(tmp, &dev->regs->irqstat1); + spin_unlock(&dev->lock); if (stat & BIT(SUSPEND_REQUEST_INTERRUPT)) { if (dev->driver->suspend) dev->driver->suspend(&dev->gadget); @@ -3346,6 +3357,7 @@ static void handle_stat1_irqs(struct net2280 *dev, u32 stat) dev->driver->resume(&dev->gadget); /* at high speed, note erratum 0133 */ } + spin_lock(&dev->lock); stat &= ~tmp; } From 403c5c2377ca18f99dbd38006b31705fd0e93ddf Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 21 Aug 2018 11:59:52 +0200 Subject: [PATCH 1073/1212] USB: serial: io_ti: fix array underflow in completion handler commit 691a03cfe8ca483f9c48153b869d354e4ae3abef upstream. As reported by Dan Carpenter, a malicious USB device could set port_number to a negative value and we would underflow the port array in the interrupt completion handler. As these devices only have one or two ports, fix this by making sure we only consider the seventh bit when determining the port number (and ignore bits 0xb0 which are typically set to 0x30). 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable Reported-by: Dan Carpenter Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/io_ti.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/io_ti.h b/drivers/usb/serial/io_ti.h index 1bd67b24f916..bc9ff5ebd67c 100644 --- a/drivers/usb/serial/io_ti.h +++ b/drivers/usb/serial/io_ti.h @@ -178,7 +178,7 @@ struct ump_interrupt { } __attribute__((packed)); -#define TIUMP_GET_PORT_FROM_CODE(c) (((c) >> 4) - 3) +#define TIUMP_GET_PORT_FROM_CODE(c) (((c) >> 6) & 0x01) #define TIUMP_GET_FUNC_FROM_CODE(c) ((c) & 0x0f) #define TIUMP_INTERRUPT_CODE_LSR 0x03 #define TIUMP_INTERRUPT_CODE_MSR 0x04 From b214cde70484a04e816fae671af146d346cddc88 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 1 Sep 2018 16:25:08 +0800 Subject: [PATCH 1074/1212] usb: misc: uss720: Fix two sleep-in-atomic-context bugs commit bc8acc214d3f1cafebcbcd101a695bbac716595d upstream. async_complete() in uss720.c is a completion handler function for the USB driver. So it should not sleep, but it can sleep according to the function call paths (from bottom to top) in Linux-4.16.
[FUNC] set_1284_register(GFP_KERNEL) drivers/usb/misc/uss720.c, 372: set_1284_register in parport_uss720_frob_control drivers/parport/ieee1284.c, 560: [FUNC_PTR]parport_uss720_frob_control in parport_ieee1284_ack_data_avail drivers/parport/ieee1284.c, 577: parport_ieee1284_ack_data_avail in parport_ieee1284_interrupt ./include/linux/parport.h, 474: parport_ieee1284_interrupt in parport_generic_irq drivers/usb/misc/uss720.c, 116: parport_generic_irq in async_complete [FUNC] get_1284_register(GFP_KERNEL) drivers/usb/misc/uss720.c, 382: get_1284_register in parport_uss720_read_status drivers/parport/ieee1284.c, 555: [FUNC_PTR]parport_uss720_read_status in parport_ieee1284_ack_data_avail drivers/parport/ieee1284.c, 577: parport_ieee1284_ack_data_avail in parport_ieee1284_interrupt ./include/linux/parport.h, 474: parport_ieee1284_interrupt in parport_generic_irq drivers/usb/misc/uss720.c, 116: parport_generic_irq in async_complete Note that [FUNC_PTR] means a function pointer call is used. To fix these bugs, GFP_KERNEL is replaced with GFP_ATOMIC. These bugs are found by my static analysis tool DSAC. 
Signed-off-by: Jia-Ju Bai Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/uss720.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/misc/uss720.c b/drivers/usb/misc/uss720.c index 442b6631162e..3d750671b85a 100644 --- a/drivers/usb/misc/uss720.c +++ b/drivers/usb/misc/uss720.c @@ -388,7 +388,7 @@ static unsigned char parport_uss720_frob_control(struct parport *pp, unsigned ch mask &= 0x0f; val &= 0x0f; d = (priv->reg[1] & (~mask)) ^ val; - if (set_1284_register(pp, 2, d, GFP_KERNEL)) + if (set_1284_register(pp, 2, d, GFP_ATOMIC)) return 0; priv->reg[1] = d; return d & 0xf; @@ -398,7 +398,7 @@ static unsigned char parport_uss720_read_status(struct parport *pp) { unsigned char ret; - if (get_1284_register(pp, 1, &ret, GFP_KERNEL)) + if (get_1284_register(pp, 1, &ret, GFP_ATOMIC)) return 0; return ret & 0xf8; } From 4be36bccc6c59b7482398a9db4c4df84f0a944af Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 15 Aug 2018 21:44:25 +0100 Subject: [PATCH 1075/1212] USB: yurex: Fix buffer over-read in yurex_write() commit 7e10f14ebface44a48275c8d6dc1caae3668d5a9 upstream. If the written data starts with a digit, yurex_write() tries to parse it as an integer using simple_strtoull(). This requires a null- terminator, and currently there's no guarantee that there is one. (The sample program at https://github.com/NeoCat/YUREX-driver-for-Linux/blob/master/sample/yurex_clock.pl writes an integer without a null terminator. It seems like it must have worked by chance!) Always add a null byte after the written data. Enlarge the buffer to allow for this. 
Cc: stable@vger.kernel.org Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/yurex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/misc/yurex.c b/drivers/usb/misc/yurex.c index 512c84adcace..e8e8702d5adf 100644 --- a/drivers/usb/misc/yurex.c +++ b/drivers/usb/misc/yurex.c @@ -439,13 +439,13 @@ static ssize_t yurex_write(struct file *file, const char __user *user_buffer, { struct usb_yurex *dev; int i, set = 0, retval = 0; - char buffer[16]; + char buffer[16 + 1]; char *data = buffer; unsigned long long c, c2 = 0; signed long timeout = 0; DEFINE_WAIT(wait); - count = min(sizeof(buffer), count); + count = min(sizeof(buffer) - 1, count); dev = file->private_data; /* verify that we actually have some data to write */ @@ -464,6 +464,7 @@ static ssize_t yurex_write(struct file *file, const char __user *user_buffer, retval = -EFAULT; goto error; } + buffer[count] = 0; memset(dev->cntl_buffer, CMD_PADDING, YUREX_BUF_SIZE); switch (buffer[0]) { From b8b53adeca0e3ce18663338deb632b41c43a5176 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 1 Sep 2018 16:12:10 +0800 Subject: [PATCH 1076/1212] usb: cdc-wdm: Fix a sleep-in-atomic-context bug in service_outstanding_interrupt() commit 6e22e3af7bb3a7b9dc53cb4687659f6e63fca427 upstream. wdm_in_callback() is a completion handler function for the USB driver. So it should not sleep. But it calls service_outstanding_interrupt(), which calls usb_submit_urb() with GFP_KERNEL. To fix this bug, GFP_KERNEL is replaced with GFP_ATOMIC. This bug is found by my static analysis tool DSAC. 
Signed-off-by: Jia-Ju Bai Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 61ea87917433..4380e4f600ab 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -453,7 +453,7 @@ static int clear_wdm_read_flag(struct wdm_device *desc) set_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irq(&desc->iuspin); - rv = usb_submit_urb(desc->response, GFP_KERNEL); + rv = usb_submit_urb(desc->response, GFP_ATOMIC); spin_lock_irq(&desc->iuspin); if (rv) { dev_err(&desc->intf->dev, From 2d3631960dd329d2dfb547645851feb774de781e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Sep 2018 12:47:51 +0300 Subject: [PATCH 1077/1212] cifs: prevent integer overflow in nxt_dir_entry() commit 8ad8aa353524d89fa2e09522f3078166ff78ec42 upstream. The "old_entry + le32_to_cpu(pDirInfo->NextEntryOffset)" can wrap around so I have added a check for integer overflow. 
Reported-by: Dr Silvio Cesare of InfoSect Reviewed-by: Ronnie Sahlberg Reviewed-by: Aurelien Aptel Signed-off-by: Dan Carpenter Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/readdir.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 97d1a15873c5..57b039ebfb1f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -373,8 +373,15 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + pfData->FileNameLength; - } else - new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); + } else { + u32 next_offset = le32_to_cpu(pDirInfo->NextEntryOffset); + + if (old_entry + next_offset < old_entry) { + cifs_dbg(VFS, "invalid offset %u\n", next_offset); + return NULL; + } + new_entry = old_entry + next_offset; + } cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry); /* validate that new_entry is not past end of SMB */ if (new_entry >= end_of_smb) { From 74fb46864fcbb9b42eb03b10adeeece6f6dc6970 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Sep 2018 12:48:22 +0300 Subject: [PATCH 1078/1212] CIFS: fix wrapping bugs in num_entries() commit 56446f218af1133c802dad8e9e116f07f381846c upstream. The problem is that "entryptr + next_offset" and "entryptr + len + size" can wrap. I ended up changing the type of "entryptr" because it makes the math easier when we don't have to do so much casting. 
Signed-off-by: Dan Carpenter Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Reviewed-by: Pavel Shilovsky CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 52d79fb04115..f7111bb88ec1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2402,33 +2402,38 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) int len; unsigned int entrycount = 0; unsigned int next_offset = 0; - FILE_DIRECTORY_INFO *entryptr; + char *entryptr; + FILE_DIRECTORY_INFO *dir_info; if (bufstart == NULL) return 0; - entryptr = (FILE_DIRECTORY_INFO *)bufstart; + entryptr = bufstart; while (1) { - entryptr = (FILE_DIRECTORY_INFO *) - ((char *)entryptr + next_offset); - - if ((char *)entryptr + size > end_of_buf) { + if (entryptr + next_offset < entryptr || + entryptr + next_offset > end_of_buf || + entryptr + next_offset + size > end_of_buf) { cifs_dbg(VFS, "malformed search entry would overflow\n"); break; } - len = le32_to_cpu(entryptr->FileNameLength); - if ((char *)entryptr + len + size > end_of_buf) { + entryptr = entryptr + next_offset; + dir_info = (FILE_DIRECTORY_INFO *)entryptr; + + len = le32_to_cpu(dir_info->FileNameLength); + if (entryptr + len < entryptr || + entryptr + len > end_of_buf || + entryptr + len + size > end_of_buf) { cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n", end_of_buf); break; } - *lastentry = (char *)entryptr; + *lastentry = entryptr; entrycount++; - next_offset = le32_to_cpu(entryptr->NextEntryOffset); + next_offset = le32_to_cpu(dir_info->NextEntryOffset); if (!next_offset) break; } From 0b726a48b9854576b518d0a1c58575078fdf3811 Mon Sep 17 00:00:00 2001 From: "Maciej W. 
Rozycki" Date: Tue, 15 May 2018 23:32:45 +0100 Subject: [PATCH 1079/1212] binfmt_elf: Respect error return from `regset->active' [ Upstream commit 2f819db565e82e5f73cd42b39925098986693378 ] The regset API documented in <linux/regset.h> defines -ENODEV as the result of the `->active' handler to be used where the feature requested is not available on the hardware found. However code handling core file note generation in `fill_thread_core_info' interprets any non-zero result from the `->active' handler as the regset requested being active. Consequently processing continues (and hopefully gracefully fails later on) rather than being abandoned right away for the regset requested. Fix the problem then by making the code proceed only if a positive result is returned from the `->active' handler. Signed-off-by: Maciej W. Rozycki Signed-off-by: Paul Burton Fixes: 4206d3aa1978 ("elf core dump: notes user_regset") Patchwork: https://patchwork.linux-mips.org/patch/19332/ Cc: Alexander Viro Cc: James Hogan Cc: Ralf Baechle Cc: linux-fsdevel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f44e93d2650d..62bc72001fce 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1707,7 +1707,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset *regset = &view->regsets[i]; do_thread_regset_writeback(t->task, regset); if (regset->core_note_type && regset->get && - (!regset->active || regset->active(t->task, regset))) { + (!regset->active || regset->active(t->task, regset) > 0)) { int ret; size_t size = regset->n * regset->size; void *data = kmalloc(size, GFP_KERNEL); From 20d6cff3fc059ac32d92fc737a8b9e80bb01ee4e Mon Sep 17 00:00:00 2001 From: Ronny Chevalier Date: Wed, 11 Jul 2018 14:39:37 +0200 Subject: [PATCH 1080/1212] audit: fix use-after-free in
audit_add_watch [ Upstream commit baa2a4fdd525c8c4b0f704d20457195b29437839 ] audit_add_watch stores locally krule->watch without taking a reference on watch. Then, it calls audit_add_to_parent, and uses the watch stored locally. Unfortunately, it is possible that audit_add_to_parent updates krule->watch. When it happens, it also drops a reference of watch which could free the watch. How to reproduce (with KASAN enabled): auditctl -w /etc/passwd -F success=0 -k test_passwd auditctl -w /etc/passwd -F success=1 -k test_passwd2 The second call to auditctl triggers the use-after-free, because audit_to_parent updates krule->watch to use a previous existing watch and drops the reference to the newly created watch. To fix the issue, we grab a reference of watch and we release it at the end of the function. Signed-off-by: Ronny Chevalier Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/audit_watch.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index a162661c9d60..f45a9a5d3e47 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -419,6 +419,13 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) struct path parent_path; int h, ret = 0; + /* + * When we will be calling audit_add_to_parent, krule->watch might have + * been updated and watch might have been freed. + * So we need to keep a reference of watch. + */ + audit_get_watch(watch); + mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. 
*/ @@ -427,8 +434,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret) + if (ret) { + audit_put_watch(watch); return ret; + } /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); @@ -446,6 +455,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) *list = &audit_inode_hash[h]; error: path_put(&parent_path); + audit_put_watch(watch); return ret; } From 1533250294cf5fe0b5ea3be4b58732b6d758fc69 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 7 Jul 2018 05:37:22 +0200 Subject: [PATCH 1081/1212] mtdchar: fix overflows in adjustment of `count` [ Upstream commit 6c6bc9ea84d0008024606bf5ba10519e20d851bf ] The first checks in mtdchar_read() and mtdchar_write() attempt to limit `count` such that `*ppos + count <= mtd->size`. However, they ignore the possibility of `*ppos > mtd->size`, allowing the calculation of `count` to wrap around. `mtdchar_lseek()` prevents seeking beyond mtd->size, but the pread/pwrite syscalls bypass this. I haven't found any codepath on which this actually causes dangerous behavior, but it seems like a sensible change anyway. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jann Horn Signed-off-by: Boris Brezillon Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/mtdchar.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 6d19835b80a9..0d244dac1ccb 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -160,8 +160,12 @@ static ssize_t mtdchar_read(struct file *file, char __user *buf, size_t count, pr_debug("MTD_read\n"); - if (*ppos + count > mtd->size) - count = mtd->size - *ppos; + if (*ppos + count > mtd->size) { + if (*ppos < mtd->size) + count = mtd->size - *ppos; + else + count = 0; + } if (!count) return 0; @@ -246,7 +250,7 @@ static ssize_t mtdchar_write(struct file *file, const char __user *buf, size_t c pr_debug("MTD_write\n"); - if (*ppos == mtd->size) + if (*ppos >= mtd->size) return -ENOSPC; if (*ppos + count > mtd->size) From f13ad779213b8345f55ea5e896df259c063d7898 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Mon, 16 Jul 2018 08:26:36 -0700 Subject: [PATCH 1082/1212] MIPS: loongson64: cs5536: Fix PCI_OHCI_INT_REG reads [ Upstream commit cd87668d601f622e0ebcfea4f78d116d5f572f4d ] The PCI_OHCI_INT_REG case in pci_ohci_read_reg() contains the following if statement: if ((lo & 0x00000f00) == CS5536_USB_INTR) CS5536_USB_INTR expands to the constant 11, which gives us the following condition which can never evaluate true: if ((lo & 0xf00) == 11) At least when using GCC 8.1.0 this falls foul of the tautological-compare warning, and since the code is built with the -Werror flag the build fails. Fix this by shifting lo right by 8 bits in order to match the corresponding PCI_OHCI_INT_REG case in pci_ohci_write_reg().
Signed-off-by: Paul Burton Patchwork: https://patchwork.linux-mips.org/patch/19861/ Cc: Huacai Chen Cc: James Hogan Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/loongson64/common/cs5536/cs5536_ohci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/loongson64/common/cs5536/cs5536_ohci.c b/arch/mips/loongson64/common/cs5536/cs5536_ohci.c index f7c905e50dc4..92dc6bafc127 100644 --- a/arch/mips/loongson64/common/cs5536/cs5536_ohci.c +++ b/arch/mips/loongson64/common/cs5536/cs5536_ohci.c @@ -138,7 +138,7 @@ u32 pci_ohci_read_reg(int reg) break; case PCI_OHCI_INT_REG: _rdmsr(DIVIL_MSR_REG(PIC_YSEL_LOW), &hi, &lo); - if ((lo & 0x00000f00) == CS5536_USB_INTR) + if (((lo >> PIC_YSEL_LOW_USB_SHIFT) & 0xf) == CS5536_USB_INTR) conf_data = 1; break; default: From 96d4584f786f1809ba6cd092fb4bfdff15e81d9b Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 12 Jul 2018 11:28:24 +0200 Subject: [PATCH 1083/1212] ARM: hisi: handle of_iomap and fix missing of_node_put [ Upstream commit d396cb185c0337aae5664b250cdd9a73f6eb1503 ] Relying on an unchecked of_iomap() which can return NULL is problematic here, an explicit check seems mandatory. Also the call to of_find_compatible_node() returns a device node with refcount incremented, therefore an explicit of_node_put() is needed here.
Signed-off-by: Nicholas Mc Guire Fixes: commit 22bae4290457 ("ARM: hi3xxx: add hotplug support") Signed-off-by: Wei Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-hisi/hotplug.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-hisi/hotplug.c b/arch/arm/mach-hisi/hotplug.c index a129aae72602..3f28c9141b48 100644 --- a/arch/arm/mach-hisi/hotplug.c +++ b/arch/arm/mach-hisi/hotplug.c @@ -148,13 +148,20 @@ static int hi3xxx_hotplug_init(void) struct device_node *node; node = of_find_compatible_node(NULL, NULL, "hisilicon,sysctrl"); - if (node) { - ctrl_base = of_iomap(node, 0); - id = HI3620_CTRL; - return 0; + if (!node) { + id = ERROR_CTRL; + return -ENOENT; } - id = ERROR_CTRL; - return -ENOENT; + + ctrl_base = of_iomap(node, 0); + of_node_put(node); + if (!ctrl_base) { + id = ERROR_CTRL; + return -ENOMEM; + } + + id = HI3620_CTRL; + return 0; } void hi3xxx_set_cpu(int cpu, bool enable) From eafee0717962080c0ff2f26cecb17525d8d7091e Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 12 Jul 2018 11:28:22 +0200 Subject: [PATCH 1084/1212] ARM: hisi: fix error handling and missing of_node_put [ Upstream commit 9f30b5ae0585ca5234fe979294b8f897299dec99 ] of_iomap() can return NULL which seems critical here and thus should be explicitly flagged so that the cause of system halting can be understood. As of_find_compatible_node() is returning a device node with refcount incremented it must be explicitly decremented here. 
Signed-off-by: Nicholas Mc Guire Fixes: commit 7fda91e73155 ("ARM: hisi: enable smp for HiP01") Signed-off-by: Wei Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-hisi/hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-hisi/hotplug.c b/arch/arm/mach-hisi/hotplug.c index 3f28c9141b48..32870560b280 100644 --- a/arch/arm/mach-hisi/hotplug.c +++ b/arch/arm/mach-hisi/hotplug.c @@ -226,10 +226,10 @@ void hip01_set_cpu(int cpu, bool enable) if (!ctrl_base) { np = of_find_compatible_node(NULL, NULL, "hisilicon,hip01-sysctrl"); - if (np) - ctrl_base = of_iomap(np, 0); - else - BUG(); + BUG_ON(!np); + ctrl_base = of_iomap(np, 0); + of_node_put(np); + BUG_ON(!ctrl_base); } if (enable) { From f615d18bab766af99814ae5ab90e33f481cde4a9 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 12 Jul 2018 11:28:23 +0200 Subject: [PATCH 1085/1212] ARM: hisi: check of_iomap and fix missing of_node_put [ Upstream commit 81646a3d39ef14749301374a3a0b8311384cd412 ] of_find_compatible_node() returns a device node with refcount incremented and thus needs an explicit of_node_put(). Further relying on an unchecked of_iomap() which can return NULL is problematic here, after all ctrl_base is critical enough for hix5hd2_set_cpu() to call BUG() if not available so a check seems mandated here. 
Signed-off-by: Nicholas Mc Guire 0002 Fixes: commit 06cc5c1d4d73 ("ARM: hisi: enable hix5hd2 SoC") Signed-off-by: Wei Xu Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-hisi/hotplug.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm/mach-hisi/hotplug.c b/arch/arm/mach-hisi/hotplug.c index 32870560b280..909bb2493781 100644 --- a/arch/arm/mach-hisi/hotplug.c +++ b/arch/arm/mach-hisi/hotplug.c @@ -180,11 +180,15 @@ static bool hix5hd2_hotplug_init(void) struct device_node *np; np = of_find_compatible_node(NULL, NULL, "hisilicon,cpuctrl"); - if (np) { - ctrl_base = of_iomap(np, 0); - return true; - } - return false; + if (!np) + return false; + + ctrl_base = of_iomap(np, 0); + of_node_put(np); + if (!ctrl_base) + return false; + + return true; } void hix5hd2_set_cpu(int cpu, bool enable) From eddbab9022ecbef7e00679f362449a49ddf3d323 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 30 May 2018 16:06:25 +0200 Subject: [PATCH 1086/1212] drm/nouveau: tegra: Detach from ARM DMA/IOMMU mapping [ Upstream commit b59fb482b52269977ee5de205308e5b236a03917 ] Depending on the kernel configuration, early ARM architecture setup code may have attached the GPU to a DMA/IOMMU mapping that transparently uses the IOMMU to back the DMA API. Tegra requires special handling for IOMMU backed buffers (a special bit in the GPU's MMU page tables indicates the memory path to take: via the SMMU or directly to the memory controller). Transparently backing DMA memory with an IOMMU prevents Nouveau from properly handling such memory accesses and causes memory access faults. As a side-note: buffers other than those allocated in instance memory don't need to be physically contiguous from the GPU's perspective since the GPU can map them into contiguous buffers using its own MMU. Mapping these buffers through the IOMMU is unnecessary and will even lead to performance degradation because of the additional translation. 
One exception to this are compressible buffers which need large pages. In order to enable these large pages, multiple small pages will have to be combined into one large (I/O virtually contiguous) mapping via the IOMMU. However, that is a topic outside the scope of this fix and isn't currently supported. An implementation will want to explicitly create these large pages in the Nouveau driver, so detaching from a DMA/IOMMU mapping would still be required. Signed-off-by: Thierry Reding Acked-by: Christoph Hellwig Reviewed-by: Robin Murphy Tested-by: Nicolas Chauvet Signed-off-by: Ben Skeggs Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c index e7e581d6a8ff..1bfc4807ce5b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c @@ -23,6 +23,10 @@ #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER #include "priv.h" +#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) +#include +#endif + static int nvkm_device_tegra_power_up(struct nvkm_device_tegra *tdev) { @@ -85,6 +89,15 @@ nvkm_device_tegra_probe_iommu(struct nvkm_device_tegra *tdev) unsigned long pgsize_bitmap; int ret; +#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) + if (dev->archdata.mapping) { + struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev); + + arm_iommu_detach_device(dev); + arm_iommu_release_mapping(mapping); + } +#endif + if (!tdev->func->iommu_bit) return; From 05925d75714070f511dfd247f4b0f387f856b253 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 12 Jul 2018 22:29:55 +0100 Subject: [PATCH 1087/1212] parport: sunbpp: fix error return code [ Upstream commit faa1a47388b33623e4d504c23569188907b039a0 ] Return an error code on failure. Change leading spaces to tab on the first if. Problem found using Coccinelle. 
Signed-off-by: Julia Lawall Signed-off-by: Sudip Mukherjee Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/parport/parport_sunbpp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/parport/parport_sunbpp.c b/drivers/parport/parport_sunbpp.c index 01cf1c1a841a..8de329546b82 100644 --- a/drivers/parport/parport_sunbpp.c +++ b/drivers/parport/parport_sunbpp.c @@ -286,12 +286,16 @@ static int bpp_probe(struct platform_device *op) ops = kmemdup(&parport_sunbpp_ops, sizeof(struct parport_operations), GFP_KERNEL); - if (!ops) + if (!ops) { + err = -ENOMEM; goto out_unmap; + } dprintk(("register_port\n")); - if (!(p = parport_register_port((unsigned long)base, irq, dma, ops))) + if (!(p = parport_register_port((unsigned long)base, irq, dma, ops))) { + err = -ENOMEM; goto out_free_ops; + } p->size = size; p->dev = &op->dev; From bfb2cbf2aaeafa2f24c31965cfbb079c79767c92 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 11 Jul 2018 13:40:28 -0600 Subject: [PATCH 1088/1212] coresight: Handle errors in finding input/output ports [ Upstream commit fe470f5f7f684ed15bc49b6183a64237547910ff ] If we fail to find the input / output port for a LINK component while enabling a path, we should fail gracefully rather than assuming port "0". 
Cc: Mathieu Poirier Signed-off-by: Suzuki K Poulose Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c index 93738dfbf631..902ee6efd09c 100644 --- a/drivers/hwtracing/coresight/coresight.c +++ b/drivers/hwtracing/coresight/coresight.c @@ -86,7 +86,7 @@ static int coresight_find_link_inport(struct coresight_device *csdev) dev_err(&csdev->dev, "couldn't find inport, parent: %s, child: %s\n", dev_name(&parent->dev), dev_name(&csdev->dev)); - return 0; + return -ENODEV; } static int coresight_find_link_outport(struct coresight_device *csdev) @@ -107,7 +107,7 @@ static int coresight_find_link_outport(struct coresight_device *csdev) dev_err(&csdev->dev, "couldn't find outport, parent: %s, child: %s\n", dev_name(&csdev->dev), dev_name(&child->dev)); - return 0; + return -ENODEV; } static int coresight_enable_sink(struct coresight_device *csdev) @@ -155,6 +155,9 @@ static int coresight_enable_link(struct coresight_device *csdev) else refport = 0; + if (refport < 0) + return refport; + if (atomic_inc_return(&csdev->refcnt[refport]) == 1) { if (link_ops(csdev)->enable) { ret = link_ops(csdev)->enable(csdev, inport, outport); From 2e1c096da6402915919917a2a6dffef1ec1e0fba Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Jul 2018 13:40:35 -0600 Subject: [PATCH 1089/1212] coresight: tpiu: Fix disabling timeouts [ Upstream commit ccff2dfaceaca4517432f5c149594215fe9098cc ] Probing the TPIU driver under UBSan triggers an out-of-bounds shift warning in coresight_timeout(): ... [ 5.677530] UBSAN: Undefined behaviour in drivers/hwtracing/coresight/coresight.c:929:16 [ 5.685542] shift exponent 64 is too large for 64-bit type 'long unsigned int' ... 
On closer inspection things are exponentially out of whack because we're passing a bitmask where a bit number should be. Amusingly, it seems that both calls will find their expected values by sheer luck and appear to succeed: 1 << FFCR_FON_MAN ends up at bit 64 which whilst undefined evaluates as zero in practice, while 1 << FFSR_FT_STOPPED finds bit 2 (TCPresent) which apparently is usually tied high. Following the examples of other drivers, define separate FOO and FOO_BIT macros for masks vs. indices, and put things right. CC: Robert Walker CC: Mike Leach CC: Mathieu Poirier Fixes: 11595db8e17f ("coresight: Fix disabling of CoreSight TPIU") Signed-off-by: Robin Murphy Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-tpiu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index 22e10b7d505d..fe3a2b19a5db 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -46,8 +46,9 @@ /** register definition **/ /* FFSR - 0x300 */ -#define FFSR_FT_STOPPED BIT(1) +#define FFSR_FT_STOPPED_BIT 1 /* FFCR - 0x304 */ +#define FFCR_FON_MAN_BIT 6 #define FFCR_FON_MAN BIT(6) #define FFCR_STOP_FI BIT(12) @@ -93,9 +94,9 @@ static void tpiu_disable_hw(struct tpiu_drvdata *drvdata) /* Generate manual flush */ writel_relaxed(FFCR_STOP_FI | FFCR_FON_MAN, drvdata->base + TPIU_FFCR); /* Wait for flush to complete */ - coresight_timeout(drvdata->base, TPIU_FFCR, FFCR_FON_MAN, 0); + coresight_timeout(drvdata->base, TPIU_FFCR, FFCR_FON_MAN_BIT, 0); /* Wait for formatter to stop */ - coresight_timeout(drvdata->base, TPIU_FFSR, FFSR_FT_STOPPED, 1); + coresight_timeout(drvdata->base, TPIU_FFSR, FFSR_FT_STOPPED_BIT, 1); CS_LOCK(drvdata->base); } From 785cb2b02ff7c3000caa4eec977279f0fdba54c9 Mon Sep 17 00:00:00 
2001 From: Andy Shevchenko Date: Mon, 9 Jul 2018 21:47:27 +0300 Subject: [PATCH 1090/1212] gpiolib: Mark gpio_suffixes array with __maybe_unused MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b23ec59926faf05b0c43680d05671c484e810ac4 ] Since we put a static variable in a header file, it is copied to each module that includes the header. But not all of them actually use it. Mark gpio_suffixes array with __maybe_unused to hide a compiler warning: In file included from drivers/gpio/gpiolib-legacy.c:6:0: drivers/gpio/gpiolib.h:95:27: warning: ‘gpio_suffixes’ defined but not used [-Wunused-const-variable=] static const char * const gpio_suffixes[] = { "gpios", "gpio" }; ^~~~~~~~~~~~~ In file included from drivers/gpio/gpiolib-devprop.c:17:0: drivers/gpio/gpiolib.h:95:27: warning: ‘gpio_suffixes’ defined but not used [-Wunused-const-variable=] static const char * const gpio_suffixes[] = { "gpios", "gpio" }; ^~~~~~~~~~~~~ Signed-off-by: Andy Shevchenko Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpiolib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 98ab08c0aa2d..07541c5670e6 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -30,7 +30,7 @@ struct acpi_gpio_info { }; /* gpio suffixes used for ACPI and device tree lookup */ -static const char * const gpio_suffixes[] = { "gpios", "gpio" }; +static __maybe_unused const char * const gpio_suffixes[] = { "gpios", "gpio" }; #ifdef CONFIG_ACPI void acpi_gpiochip_add(struct gpio_chip *chip); From c862b3823509195cc72a2168019ffcc765a8a0d2 Mon Sep 17 00:00:00 2001 From: Wei Lu Date: Wed, 11 Jul 2018 22:32:47 -0400 Subject: [PATCH 1091/1212] drm/amdkfd: Fix error codes in kfd_get_process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit
e47cb828eb3fca3e8999a0b9aa053dda18552071 ] Return ERR_PTR(-EINVAL) if kfd_get_process fails to find the process. This fixes kernel oopses when a child process calls KFD ioctls with a file descriptor inherited from the parent process. Signed-off-by: Wei Lu Reviewed-by: Felix Kuehling Signed-off-by: Felix Kuehling Acked-by: Christian König Signed-off-by: Oded Gabbay Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index eb1da83c9902..8cdd505784ed 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -125,6 +125,8 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) return ERR_PTR(-EINVAL); process = find_process(thread); + if (!process) + return ERR_PTR(-EINVAL); return process; } From 5e1924779a64aab97efa4e49e7fd9e9d8809008a Mon Sep 17 00:00:00 2001 From: Zhouyang Jia Date: Tue, 12 Jun 2018 12:40:03 +0800 Subject: [PATCH 1092/1212] rtc: bq4802: add error handling for devm_ioremap [ Upstream commit 7874b919866ba91bac253fa219d3d4c82bb944df ] When devm_ioremap fails, the lack of error-handling code may cause unexpected results. This patch adds error-handling code after calling devm_ioremap. 
Signed-off-by: Zhouyang Jia Signed-off-by: Alexandre Belloni Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/rtc/rtc-bq4802.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c index bd170cb3361c..5747a54cbd42 100644 --- a/drivers/rtc/rtc-bq4802.c +++ b/drivers/rtc/rtc-bq4802.c @@ -164,6 +164,10 @@ static int bq4802_probe(struct platform_device *pdev) } else if (p->r->flags & IORESOURCE_MEM) { p->regs = devm_ioremap(&pdev->dev, p->r->start, resource_size(p->r)); + if (!p->regs){ + err = -ENOMEM; + goto out; + } p->read = bq4802_read_mem; p->write = bq4802_write_mem; } else { From e573a9830c52d16f4dcbbd5fa04ce2fbaa007467 Mon Sep 17 00:00:00 2001 From: Timo Wischer Date: Tue, 10 Jul 2018 17:28:45 +0200 Subject: [PATCH 1093/1212] ALSA: pcm: Fix snd_interval_refine first/last with open min/max [ Upstream commit ff2d6acdf6f13d9f8fdcd890844c6d7535ac1f10 ] Without this commit the following intervals [x y), (x y) were replaced with (y-1 y) by snd_interval_refine_last(). This was also done if y-1 is part of the previous interval. With this change they will be replaced with [y-1 y) if y-1 is part of the previous interval. A similar behavior will be used for snd_interval_refine_first().
This commit adapts the changes for alsa-lib of commit 9bb985c ("pcm: snd_interval_refine_first/last: exclude value only if also excluded before") Signed-off-by: Timo Wischer Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/core/pcm_lib.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 4c145d6bccd4..5bc7ddf8fc70 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -648,27 +648,33 @@ EXPORT_SYMBOL(snd_interval_refine); static int snd_interval_refine_first(struct snd_interval *i) { + const unsigned int last_max = i->max; + if (snd_BUG_ON(snd_interval_empty(i))) return -EINVAL; if (snd_interval_single(i)) return 0; i->max = i->min; - i->openmax = i->openmin; - if (i->openmax) + if (i->openmin) i->max++; + /* only exclude max value if also excluded before refine */ + i->openmax = (i->openmax && i->max >= last_max); return 1; } static int snd_interval_refine_last(struct snd_interval *i) { + const unsigned int last_min = i->min; + if (snd_BUG_ON(snd_interval_empty(i))) return -EINVAL; if (snd_interval_single(i)) return 0; i->min = i->max; - i->openmin = i->openmax; - if (i->openmin) + if (i->openmax) i->min--; + /* only exclude min value if also excluded before refine */ + i->openmin = (i->openmin && i->min <= last_min); return 1; } From 5f5ea78ba60cf330bb884e56125ee49c5b30730f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 29 May 2018 19:12:18 -0700 Subject: [PATCH 1094/1212] selftest: timers: Tweak raw_skew to SKIP when ADJ_OFFSET/other clock adjustments are in progress [ Upstream commit 1416270f4a1ae83ea84156ceba19a66a8f88be1f ] In the past we've warned when ADJ_OFFSET was in progress, usually caused by ntpd or some other time adjusting daemon running in a non-steady state, which can cause the skew calculations to be incorrect.
Thus, this patch checks to see if the clock was being adjusted when we fail so that we don't cause false negatives. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Suggested-by: Miroslav Lichvar Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/timers/raw_skew.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index 30906bfd9c1b..0ab937a17ebb 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -146,6 +146,11 @@ int main(int argv, char **argc) printf(" %lld.%i(act)", ppm/1000, abs((int)(ppm%1000))); if (llabs(eppm - ppm) > 1000) { + if (tx1.offset || tx2.offset || + tx1.freq != tx2.freq || tx1.tick != tx2.tick) { + printf(" [SKIP]\n"); + return ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n"); + } printf(" [FAILED]\n"); return ksft_exit_fail(); } From 4951eb4bb6d665ff817333a1db79d8eff4def6fb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Jul 2018 12:38:09 +0300 Subject: [PATCH 1095/1212] drm/panel: type promotion bug in s6e8aa0_read_mtp_id() [ Upstream commit cd0e0ca69109d025b1a1b6609f70682db62138b0 ] The ARRAY_SIZE() macro is type size_t. If s6e8aa0_dcs_read() returns a negative error code, then "ret < ARRAY_SIZE(id)" is false because the negative error code is type promoted to a high positive value. 
Fixes: 02051ca06371 ("drm/panel: add S6E8AA0 driver") Signed-off-by: Dan Carpenter Reviewed-by: Andrzej Hajda Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20180704093807.s3lqsb2v6dg2k43d@kili.mountain Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/panel/panel-samsung-s6e8aa0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-samsung-s6e8aa0.c b/drivers/gpu/drm/panel/panel-samsung-s6e8aa0.c index a188a3959f1a..6ad827b93ae1 100644 --- a/drivers/gpu/drm/panel/panel-samsung-s6e8aa0.c +++ b/drivers/gpu/drm/panel/panel-samsung-s6e8aa0.c @@ -823,7 +823,7 @@ static void s6e8aa0_read_mtp_id(struct s6e8aa0 *ctx) int ret, i; ret = s6e8aa0_dcs_read(ctx, 0xd1, id, ARRAY_SIZE(id)); - if (ret < ARRAY_SIZE(id) || id[0] == 0x00) { + if (ret < 0 || ret < ARRAY_SIZE(id) || id[0] == 0x00) { dev_err(ctx->dev, "read id failed\n"); ctx->error = -EIO; return; From 86312d58a9defcc840c8f68ff36d82130cb84c28 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 2 Jul 2018 15:59:39 -0700 Subject: [PATCH 1096/1212] pinctrl: qcom: spmi-gpio: Fix pmic_gpio_config_get() to be compliant [ Upstream commit 1cf86bc21257a330e3af51f2a4e885f1a705f6a5 ] If you do this on an sdm845 board: grep "" /sys/kernel/debug/pinctrl/*spmi:pmic*/pinconf-groups ...it looks like nonsense. For every pin you see listed: input bias disabled, input bias high impedance, input bias pull down, input bias pull up, ... That's because pmic_gpio_config_get() isn't complying with the rules that pinconf_generic_dump_one() expects. Specifically for boolean parameters (anything with a "struct pin_config_item" where has_arg is false) the function expects that the function should return its value not through the "config" parameter but should return "0" if the value is set and "-EINVAL" if the value isn't set. Let's fix this. 
>From a quick sample of other pinctrl drivers, it appears to be tradition to also return 1 through the config parameter for these boolean parameters when they exist. I'm not one to knock tradition, so I'll follow tradition and return 1 in these cases. While I'm at it, I'll also continue searching for four leaf clovers, kocking on wood three times, and trying not to break mirrors. NOTE: This also fixes an apparent typo for reading PIN_CONFIG_BIAS_DISABLE where the old driver was accidentally using "=" instead of "==" and thus was setting some internal state when you tried to query PIN_CONFIG_BIAS_DISABLE. Oops. Fixes: eadff3024472 ("pinctrl: Qualcomm SPMI PMIC GPIO pin controller driver") Signed-off-by: Douglas Anderson Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 32 ++++++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 6c42ca14d2fd..4ea810cafaac 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -291,31 +291,47 @@ static int pmic_gpio_config_get(struct pinctrl_dev *pctldev, switch (param) { case PIN_CONFIG_DRIVE_PUSH_PULL: - arg = pad->buffer_type == PMIC_GPIO_OUT_BUF_CMOS; + if (pad->buffer_type != PMIC_GPIO_OUT_BUF_CMOS) + return -EINVAL; + arg = 1; break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: - arg = pad->buffer_type == PMIC_GPIO_OUT_BUF_OPEN_DRAIN_NMOS; + if (pad->buffer_type != PMIC_GPIO_OUT_BUF_OPEN_DRAIN_NMOS) + return -EINVAL; + arg = 1; break; case PIN_CONFIG_DRIVE_OPEN_SOURCE: - arg = pad->buffer_type == PMIC_GPIO_OUT_BUF_OPEN_DRAIN_PMOS; + if (pad->buffer_type != PMIC_GPIO_OUT_BUF_OPEN_DRAIN_PMOS) + return -EINVAL; + arg = 1; break; case PIN_CONFIG_BIAS_PULL_DOWN: - arg = pad->pullup == PMIC_GPIO_PULL_DOWN; + if (pad->pullup != PMIC_GPIO_PULL_DOWN) + return -EINVAL; + arg = 1; break; case 
PIN_CONFIG_BIAS_DISABLE: - arg = pad->pullup = PMIC_GPIO_PULL_DISABLE; + if (pad->pullup != PMIC_GPIO_PULL_DISABLE) + return -EINVAL; + arg = 1; break; case PIN_CONFIG_BIAS_PULL_UP: - arg = pad->pullup == PMIC_GPIO_PULL_UP_30; + if (pad->pullup != PMIC_GPIO_PULL_UP_30) + return -EINVAL; + arg = 1; break; case PIN_CONFIG_BIAS_HIGH_IMPEDANCE: - arg = !pad->is_enabled; + if (pad->is_enabled) + return -EINVAL; + arg = 1; break; case PIN_CONFIG_POWER_SOURCE: arg = pad->power_source; break; case PIN_CONFIG_INPUT_ENABLE: - arg = pad->input_enabled; + if (!pad->input_enabled) + return -EINVAL; + arg = 1; break; case PIN_CONFIG_OUTPUT: arg = pad->out_value; From 8b97b2ec3672471fa2b0a6242001280b9854ad8a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 21 Aug 2018 11:59:53 +0200 Subject: [PATCH 1097/1212] USB: serial: ti_usb_3410_5052: fix array underflow in completion handler commit 5dfdd24eb3d39d815bc952ae98128e967c9bba49 upstream. Similarly to a recently reported bug in io_ti, a malicious USB device could set port_number to a negative value and we would underflow the port array in the interrupt completion handler. As these devices only have one or two ports, fix this by making sure we only consider the seventh bit when determining the port number (and ignore bits 0xb0 which are typically set to 0x30). 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ti_usb_3410_5052.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/ti_usb_3410_5052.h b/drivers/usb/serial/ti_usb_3410_5052.h index 98f35c656c02..0cd247f75b8b 100644 --- a/drivers/usb/serial/ti_usb_3410_5052.h +++ b/drivers/usb/serial/ti_usb_3410_5052.h @@ -227,7 +227,7 @@ struct ti_interrupt { } __attribute__((packed)); /* Interrupt codes */ -#define TI_GET_PORT_FROM_CODE(c) (((c) >> 4) - 3) +#define TI_GET_PORT_FROM_CODE(c) (((c) >> 6) & 0x01) #define TI_GET_FUNC_FROM_CODE(c) ((c) & 0x0f) #define TI_CODE_HARDWARE_ERROR 0xFF #define TI_CODE_DATA_ERROR 0x03 From 24328b80de178d4ece8175481349ae33ba3ca65f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Jul 2018 15:29:31 +0300 Subject: [PATCH 1098/1212] mei: bus: type promotion bug in mei_nfc_if_version() commit b40b3e9358fbafff6a4ba0f4b9658f6617146f9c upstream. We accidentally removed the check for negative returns without considering the issue of type promotion. The "if_version_length" variable is type size_t so if __mei_cl_recv() returns a negative then "bytes_recv" is type promoted to a high positive value and treated as success. 
Cc: Fixes: 582ab27a063a ("mei: bus: fix received data size check in NFC fixup") Signed-off-by: Dan Carpenter Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus-fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index bdc7fcd80eca..9dcdc6f41ceb 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -151,7 +151,7 @@ static int mei_nfc_if_version(struct mei_cl *cl, ret = 0; bytes_recv = __mei_cl_recv(cl, (u8 *)reply, if_version_length); - if (bytes_recv < if_version_length) { + if (bytes_recv < 0 || bytes_recv < if_version_length) { dev_err(bus->dev, "Could not read IF version\n"); ret = -EIO; goto err; From 30eee1c68c52a02ec54730bfbe3ba4e3ff3a7d6e Mon Sep 17 00:00:00 2001 From: David Rivshin Date: Wed, 27 Apr 2016 21:32:31 -0400 Subject: [PATCH 1099/1212] drivers: net: cpsw: fix segfault in case of bad phy-handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d733f7542ad47cf73e033c90cf55158587e1d060 upstream. If an emac node has a phy-handle property that points to something which is not a phy, then a segmentation fault will occur when the interface is brought up. This is because while phy_connect() will return ERR_PTR() on failure, of_phy_connect() will return NULL. The common error check uses IS_ERR(), and so missed when of_phy_connect() fails. The NULL pointer is then dereferenced. Also, the common error message referenced slave->data->phy_id, which would be empty in the case of phy-handle. Instead, use the name of the device_node as a useful identifier. And in the phy_id case add the error code for completeness. Fixes: 9e42f715264f ("drivers: net: cpsw: add phy-handle parsing") Signed-off-by: David Rivshin Signed-off-by: David S. 
Miller [SZ Lin (林上智): Tweak the patch to use original print function of dev_info()] Signed-off-by: SZ Lin (林上智) Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/ti/cpsw.c | 37 +++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index c2e110b2549b..c1217a87d535 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1164,25 +1164,34 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) cpsw_ale_add_mcast(priv->ale, priv->ndev->broadcast, 1 << slave_port, 0, 0, ALE_MCAST_FWD_2); - if (slave->data->phy_node) + if (slave->data->phy_node) { slave->phy = of_phy_connect(priv->ndev, slave->data->phy_node, &cpsw_adjust_link, 0, slave->data->phy_if); - else + if (!slave->phy) { + dev_err(priv->dev, "phy \"%s\" not found on slave %d\n", + slave->data->phy_node->full_name, + slave->slave_num); + return; + } + } else { slave->phy = phy_connect(priv->ndev, slave->data->phy_id, &cpsw_adjust_link, slave->data->phy_if); - if (IS_ERR(slave->phy)) { - dev_err(priv->dev, "phy %s not found on slave %d\n", - slave->data->phy_id, slave->slave_num); - slave->phy = NULL; - } else { - dev_info(priv->dev, "phy found : id is : 0x%x\n", - slave->phy->phy_id); - phy_start(slave->phy); - - /* Configure GMII_SEL register */ - cpsw_phy_sel(&priv->pdev->dev, slave->phy->interface, - slave->slave_num); + if (IS_ERR(slave->phy)) { + dev_err(priv->dev, + "phy \"%s\" not found on slave %d, err %ld\n", + slave->data->phy_id, slave->slave_num, + PTR_ERR(slave->phy)); + slave->phy = NULL; + return; + } } + + dev_info(priv->dev, "phy found : id is : 0x%x\n", slave->phy->phy_id); + + phy_start(slave->phy); + + /* Configure GMII_SEL register */ + cpsw_phy_sel(&priv->pdev->dev, slave->phy->interface, slave->slave_num); } static inline void cpsw_add_default_vlan(struct cpsw_priv *priv) From 262ea6c0c422da1c5c8243cfde148fa9fdad7a26 Mon 
Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 30 Aug 2018 11:01:21 -0700 Subject: [PATCH 1100/1212] MIPS: VDSO: Match data page cache colouring when D$ aliases commit 0f02cfbc3d9e413d450d8d0fd660077c23f67eff upstream. When a system suffers from dcache aliasing a user program may observe stale VDSO data from an aliased cache line. Notably this can break the expectation that clock_gettime(CLOCK_MONOTONIC, ...) is, as its name suggests, monotonic. In order to ensure that users observe updates to the VDSO data page as intended, align the user mappings of the VDSO data page such that their cache colouring matches that of the virtual address range which the kernel will use to update the data page - typically its unmapped address within kseg0. This ensures that we don't introduce aliasing cache lines for the VDSO data page, and therefore that userland will observe updates without requiring cache invalidation. Signed-off-by: Paul Burton Reported-by: Hauke Mehrtens Reported-by: Rene Nielsen Reported-by: Alexandre Belloni Fixes: ebb5e78cc634 ("MIPS: Initial implementation of a VDSO") Patchwork: https://patchwork.linux-mips.org/patch/20344/ Tested-by: Alexandre Belloni Tested-by: Hauke Mehrtens Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org # v4.4+ Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/vdso.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 5649a9e429e0..aca06b18c43e 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -14,12 +14,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include /* Kernel-provided data used by the VDSO. 
*/ @@ -118,12 +120,30 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vvar_size = gic_size + PAGE_SIZE; size = vvar_size + image->size; + /* + * Find a region that's large enough for us to perform the + * colour-matching alignment below. + */ + if (cpu_has_dc_aliases) + size += shm_align_mask + 1; + base = get_unmapped_area(NULL, 0, size, 0, 0); if (IS_ERR_VALUE(base)) { ret = base; goto out; } + /* + * If we suffer from dcache aliasing, ensure that the VDSO data page + * mapping is coloured the same as the kernel's mapping of that memory. + * This ensures that when the kernel updates the VDSO data userland + * will observe it without requiring cache invalidations. + */ + if (cpu_has_dc_aliases) { + base = __ALIGN_MASK(base, shm_align_mask); + base += ((unsigned long)&vdso_data - gic_size) & shm_align_mask; + } + data_addr = base + gic_size; vdso_addr = data_addr + PAGE_SIZE; From 9fbcdd1319b481591803d29a458add4cf79e5431 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 26 Sep 2018 08:35:10 +0200 Subject: [PATCH 1101/1212] Linux 4.4.158 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2d55f88e6a08..d07a6283b67e 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 157 +SUBLEVEL = 158 EXTRAVERSION = NAME = Blurry Fish Butt From ec56e98f80666644fa43f17092fdcd861348eea1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 17 Sep 2018 15:51:40 +0200 Subject: [PATCH 1102/1212] NFC: Fix possible memory corruption when handling SHDLC I-Frame commands commit 674d9de02aa7d521ebdf66c3958758bdd9c64e11 upstream. When handling SHDLC I-Frame commands "pipe" field used for indexing into an array should be checked before usage. If left unchecked it might access memory outside of the array of size NFC_HCI_MAX_PIPES(127). 
Malformed NFC HCI frames could be injected by a malicious NFC device communicating with the device being attacked (remote attack vector), or even by an attacker with physical access to the I2C bus such that they could influence the data transfers on that bus (local attack vector). skb->data is controlled by the attacker and has only been sanitized in the most trivial ways (CRC check), therefore we can consider the create_info struct and all of its members to be tainted. 'create_info->pipe' with max value of 255 (uint8) is used to take an offset of the hdev->pipes array of 127 elements which can lead to OOB write. Cc: Samuel Ortiz Cc: Allen Pais Cc: "David S. Miller" Suggested-by: Kevin Deus Signed-off-by: Suren Baghdasaryan Acked-by: Kees Cook Cc: stable Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/nfc/hci/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 2b0f0ac498d2..5a58f9f38095 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } create_info = (struct hci_create_pipe_resp *)skb->data; + if (create_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we @@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } delete_info = (struct hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE; hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST; break; From b413ee0476ea3426846a8139a71514bb627f7596 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 17 Sep 2018 15:51:41 +0200 Subject: [PATCH 1103/1212] 
NFC: Fix the number of pipes commit e285d5bfb7e9785d289663baef252dd315e171f8 upstream. According to ETSI TS 102 622 specification chapter 4.4 pipe identifier is 7 bits long which allows for 128 unique pipe IDs. Because NFC_HCI_MAX_PIPES is used as the number of pipes supported and not as the max pipe ID, its value should be 128 instead of 127. nfc_hci_recv_from_llc extracts pipe ID from packet header using NFC_HCI_FRAGMENT(0x7F) mask which allows for pipe ID value of 127. Same happens when NCI_HCP_MSG_GET_PIPE() is being used. With pipes array having only 127 elements and pipe ID of 127 the OOB memory access will result. Cc: Samuel Ortiz Cc: Allen Pais Cc: "David S. Miller" Suggested-by: Dan Carpenter Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: stable Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/nfc/hci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h index 316694dafa5b..008f466d1da7 100644 --- a/include/net/nfc/hci.h +++ b/include/net/nfc/hci.h @@ -87,7 +87,7 @@ struct nfc_hci_pipe { * According to specification 102 622 chapter 4.4 Pipes, * the pipe identifier is 7 bits long. */ -#define NFC_HCI_MAX_PIPES 127 +#define NFC_HCI_MAX_PIPES 128 struct nfc_hci_init_data { u8 gate_count; struct nfc_hci_gate gates[NFC_HCI_MAX_CUSTOM_GATES]; From 68d542be69b3b045d5309fc15d1665aefb6fd6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Szymanski?= Date: Thu, 6 Sep 2018 11:16:00 +0200 Subject: [PATCH 1104/1212] ASoC: cs4265: fix MMTLR Data switch control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 90a3b7f8aba3011badacd6d8121e03aa24ac79d1 upstream. The MMTLR bit is in the CS4265_SPDIF_CTL2 register at address 0x12 bit 0 and not at address 0x0 bit 1. Fix this. 
Signed-off-by: Sébastien Szymanski Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/cs4265.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs4265.c b/sound/soc/codecs/cs4265.c index 55db19ddc5ff..93b02be3a90e 100644 --- a/sound/soc/codecs/cs4265.c +++ b/sound/soc/codecs/cs4265.c @@ -157,8 +157,8 @@ static const struct snd_kcontrol_new cs4265_snd_controls[] = { SOC_SINGLE("Validity Bit Control Switch", CS4265_SPDIF_CTL2, 3, 1, 0), SOC_ENUM("SPDIF Mono/Stereo", spdif_mono_stereo_enum), - SOC_SINGLE("MMTLR Data Switch", 0, - 1, 1, 0), + SOC_SINGLE("MMTLR Data Switch", CS4265_SPDIF_CTL2, + 0, 1, 0), SOC_ENUM("Mono Channel Select", spdif_mono_select_enum), SND_SOC_BYTES("C Data Buffer", CS4265_C_DATA_BUFF, 24), }; From 1da7fa6383fde3e7a4954460352f7f4574c99bfd Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 9 Sep 2018 22:25:12 +0900 Subject: [PATCH 1105/1212] ALSA: bebob: use address returned by kmalloc() instead of kernel stack for streaming DMA mapping commit 493626f2d87a74e6dbea1686499ed6e7e600484e upstream. When executing 'fw_run_transaction()' with 'TCODE_WRITE_BLOCK_REQUEST', an address of 'payload' argument is used for streaming DMA mapping by 'firewire_ohci' module if 'size' argument is larger than 8 byte. Although in this case the address should not be on kernel stack, current implementation of ALSA bebob driver uses data in kernel stack for a cue to boot M-Audio devices. This often brings unexpected result, especially for a case of CONFIG_VMAP_STACK=y. This commit fixes the bug. 
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=201021 Reference: https://forum.manjaro.org/t/firewire-m-audio-410-driver-wont-load-firmware/51165 Fixes: a2b2a7798fb6('ALSA: bebob: Send a cue to load firmware for M-Audio Firewire series') Cc: # v3.16+ Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/firewire/bebob/bebob_maudio.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sound/firewire/bebob/bebob_maudio.c b/sound/firewire/bebob/bebob_maudio.c index 07e5abdbceb5..0a576ccca3dc 100644 --- a/sound/firewire/bebob/bebob_maudio.c +++ b/sound/firewire/bebob/bebob_maudio.c @@ -96,17 +96,13 @@ int snd_bebob_maudio_load_firmware(struct fw_unit *unit) struct fw_device *device = fw_parent_device(unit); int err, rcode; u64 date; - __le32 cues[3] = { - cpu_to_le32(MAUDIO_BOOTLOADER_CUE1), - cpu_to_le32(MAUDIO_BOOTLOADER_CUE2), - cpu_to_le32(MAUDIO_BOOTLOADER_CUE3) - }; + __le32 *cues; /* check date of software used to build */ err = snd_bebob_read_block(unit, INFO_OFFSET_SW_DATE, &date, sizeof(u64)); if (err < 0) - goto end; + return err; /* * firmware version 5058 or later has date later than "20070401", but * 'date' is not null-terminated. 
@@ -114,20 +110,28 @@ int snd_bebob_maudio_load_firmware(struct fw_unit *unit) if (date < 0x3230303730343031LL) { dev_err(&unit->device, "Use firmware version 5058 or later\n"); - err = -ENOSYS; - goto end; + return -ENXIO; } + cues = kmalloc_array(3, sizeof(*cues), GFP_KERNEL); + if (!cues) + return -ENOMEM; + + cues[0] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE1); + cues[1] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE2); + cues[2] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE3); + rcode = fw_run_transaction(device->card, TCODE_WRITE_BLOCK_REQUEST, device->node_id, device->generation, device->max_speed, BEBOB_ADDR_REG_REQ, - cues, sizeof(cues)); + cues, 3 * sizeof(*cues)); + kfree(cues); if (rcode != RCODE_COMPLETE) { dev_err(&unit->device, "Failed to send a cue to load firmware\n"); err = -EIO; } -end: + return err; } From 66d1019aff7c7d423a1ef44c3ea8fd89dd1c35bd Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 8 Sep 2018 08:12:21 +0200 Subject: [PATCH 1106/1212] ALSA: emu10k1: fix possible info leak to userspace on SNDRV_EMU10K1_IOCTL_INFO commit 49434c6c575d2008c0abbc93e615019f39e01252 upstream. snd_emu10k1_fx8010_ioctl(SNDRV_EMU10K1_IOCTL_INFO) allocates memory using kmalloc() and partially fills it by calling snd_emu10k1_fx8010_info() before returning the resulting structure to userspace, leaving uninitialized holes. Let's just use kzalloc() here. 
BugLink: http://blog.infosectcbr.com.au/2018/09/linux-kernel-infoleaks.html Signed-off-by: Willy Tarreau Cc: Jann Horn Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/emu10k1/emufx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c index 56fc47bd6dba..50b216fc369f 100644 --- a/sound/pci/emu10k1/emufx.c +++ b/sound/pci/emu10k1/emufx.c @@ -2520,7 +2520,7 @@ static int snd_emu10k1_fx8010_ioctl(struct snd_hwdep * hw, struct file *file, un emu->support_tlv = 1; return put_user(SNDRV_EMU10K1_VERSION, (int __user *)argp); case SNDRV_EMU10K1_IOCTL_INFO: - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; snd_emu10k1_fx8010_info(emu, info); From 39b38432c8ac6c084a8cadc3dba07c570ac2866b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 10 Sep 2018 13:01:53 -0500 Subject: [PATCH 1107/1212] platform/x86: alienware-wmi: Correct a memory leak commit ff0e9f26288d2daee4950f42b37a3d3d30d36ec1 upstream. An ACPI buffer that was allocated was not being freed after use. 
Signed-off-by: Mario Limonciello Cc: stable@vger.kernel.org Signed-off-by: Darren Hart (VMware) Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/alienware-wmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/alienware-wmi.c b/drivers/platform/x86/alienware-wmi.c index 1e1e59423889..3df47c1b04ec 100644 --- a/drivers/platform/x86/alienware-wmi.c +++ b/drivers/platform/x86/alienware-wmi.c @@ -463,6 +463,7 @@ static acpi_status alienware_hdmi_command(struct hdmi_args *in_args, if (obj && obj->type == ACPI_TYPE_INTEGER) *out_data = (u32) obj->integer.value; } + kfree(output.pointer); return status; } From bd1468048aa3e2af865da92990539946d4483989 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 11 Sep 2018 09:04:48 +0200 Subject: [PATCH 1108/1212] xen/netfront: don't bug in case of too many frags commit ad4f15dc2c70b1de5e0a64d27335962fbc9cf71c upstream. Commit 57f230ab04d291 ("xen/netfront: raise max number of slots in xennet_get_responses()") raised the max number of allowed slots by one. This seems to be problematic in some configurations with netback using a larger MAX_SKB_FRAGS value (e.g. old Linux kernel with MAX_SKB_FRAGS defined as 18 instead of nowadays 17). Instead of BUG_ON() in this case just fall back to retransmission. Fixes: 57f230ab04d291 ("xen/netfront: raise max number of slots in xennet_get_responses()") Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 3270b4333668..0a4bd73caae5 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -892,7 +892,11 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue, BUG_ON(pull_to <= skb_headlen(skb)); __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); } - BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS); + if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) { + queue->rx.rsp_cons = ++cons; + kfree_skb(nskb); + return ~0U; + } skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, skb_frag_page(nfrag), @@ -1029,6 +1033,8 @@ static int xennet_poll(struct napi_struct *napi, int budget) skb->len += rx->status; i = xennet_fill_frags(queue, skb, &tmpq); + if (unlikely(i == ~0U)) + goto err; if (rx->flags & XEN_NETRXF_csum_blank) skb->ip_summed = CHECKSUM_PARTIAL; From 28ca9ed1c95547daee49749287393ef6e540dbaa Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 12 Jul 2018 13:27:00 -0400 Subject: [PATCH 1109/1212] xen/x86/vpmu: Zero struct pt_regs before calling into sample handling code commit 70513d58751d7c6c1a0133557b13089b9f2e3e66 upstream. Otherwise we may leak kernel stack for events that sample user registers. 
Reported-by: Mark Rutland Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/xen/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 724a08740a04..9c7358110d32 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -477,7 +477,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { int err, ret = IRQ_NONE; - struct pt_regs regs; + struct pt_regs regs = {0}; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); uint8_t xenpmu_flags = get_xenpmu_flags(); From fed4d566a8215ab04d1e674a47d5746b051699f2 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 7 Sep 2018 15:31:29 -0700 Subject: [PATCH 1110/1212] ring-buffer: Allow for rescheduling when removing pages commit 83f365554e47997ec68dc4eca3f5dce525cd15c3 upstream. When reducing ring buffer size, pages are removed by scheduling a work item on each CPU for the corresponding CPU ring buffer. After the pages are removed from ring buffer linked list, the pages are free()d in a tight loop. The loop does not give up CPU until all pages are removed. In a worst case behavior, when lot of pages are to be freed, it can cause system stall. After the pages are removed from the list, the free() can happen while the work is rescheduled. Call cond_resched() in the loop to prevent the system hangup. 
Link: http://lkml.kernel.org/r/20180907223129.71994-1-vnagarnaik@google.com Cc: stable@vger.kernel.org Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic") Reported-by: Jason Behmer Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fdaa88f38aec..74b20e3ab8c6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1513,6 +1513,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) tmp_iter_page = first_page; do { + cond_resched(); + to_remove_page = tmp_iter_page; rb_inc_page(cpu_buffer, &tmp_iter_page); From 4da7f35b06702b1bc011270f15084a574ac76e1f Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 20 Sep 2018 12:22:39 -0700 Subject: [PATCH 1111/1212] mm: shmem.c: Correctly annotate new inodes for lockdep commit b45d71fb89ab8adfe727b9d0ee188ed58582a647 upstream. Directories and inodes don't necessarily need to be in the same lockdep class. For example, hugetlbfs splits them out too to prevent false positives in lockdep. Annotate correctly after new inode creation. If it's a directory inode, it will be put into a different class. 
This should fix a lockdep splat reported by syzbot: > ====================================================== > WARNING: possible circular locking dependency detected > 4.18.0-rc8-next-20180810+ #36 Not tainted > ------------------------------------------------------ > syz-executor900/4483 is trying to acquire lock: > 00000000d2bfc8fe (&sb->s_type->i_mutex_key#9){++++}, at: inode_lock > include/linux/fs.h:765 [inline] > 00000000d2bfc8fe (&sb->s_type->i_mutex_key#9){++++}, at: > shmem_fallocate+0x18b/0x12e0 mm/shmem.c:2602 > > but task is already holding lock: > 0000000025208078 (ashmem_mutex){+.+.}, at: ashmem_shrink_scan+0xb4/0x630 > drivers/staging/android/ashmem.c:448 > > which lock already depends on the new lock. > > -> #2 (ashmem_mutex){+.+.}: > __mutex_lock_common kernel/locking/mutex.c:925 [inline] > __mutex_lock+0x171/0x1700 kernel/locking/mutex.c:1073 > mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1088 > ashmem_mmap+0x55/0x520 drivers/staging/android/ashmem.c:361 > call_mmap include/linux/fs.h:1844 [inline] > mmap_region+0xf27/0x1c50 mm/mmap.c:1762 > do_mmap+0xa10/0x1220 mm/mmap.c:1535 > do_mmap_pgoff include/linux/mm.h:2298 [inline] > vm_mmap_pgoff+0x213/0x2c0 mm/util.c:357 > ksys_mmap_pgoff+0x4da/0x660 mm/mmap.c:1585 > __do_sys_mmap arch/x86/kernel/sys_x86_64.c:100 [inline] > __se_sys_mmap arch/x86/kernel/sys_x86_64.c:91 [inline] > __x64_sys_mmap+0xe9/0x1b0 arch/x86/kernel/sys_x86_64.c:91 > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > -> #1 (&mm->mmap_sem){++++}: > __might_fault+0x155/0x1e0 mm/memory.c:4568 > _copy_to_user+0x30/0x110 lib/usercopy.c:25 > copy_to_user include/linux/uaccess.h:155 [inline] > filldir+0x1ea/0x3a0 fs/readdir.c:196 > dir_emit_dot include/linux/fs.h:3464 [inline] > dir_emit_dots include/linux/fs.h:3475 [inline] > dcache_readdir+0x13a/0x620 fs/libfs.c:193 > iterate_dir+0x48b/0x5d0 fs/readdir.c:51 > __do_sys_getdents fs/readdir.c:231 [inline] > __se_sys_getdents 
fs/readdir.c:212 [inline] > __x64_sys_getdents+0x29f/0x510 fs/readdir.c:212 > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > -> #0 (&sb->s_type->i_mutex_key#9){++++}: > lock_acquire+0x1e4/0x540 kernel/locking/lockdep.c:3924 > down_write+0x8f/0x130 kernel/locking/rwsem.c:70 > inode_lock include/linux/fs.h:765 [inline] > shmem_fallocate+0x18b/0x12e0 mm/shmem.c:2602 > ashmem_shrink_scan+0x236/0x630 drivers/staging/android/ashmem.c:455 > ashmem_ioctl+0x3ae/0x13a0 drivers/staging/android/ashmem.c:797 > vfs_ioctl fs/ioctl.c:46 [inline] > file_ioctl fs/ioctl.c:501 [inline] > do_vfs_ioctl+0x1de/0x1720 fs/ioctl.c:685 > ksys_ioctl+0xa9/0xd0 fs/ioctl.c:702 > __do_sys_ioctl fs/ioctl.c:709 [inline] > __se_sys_ioctl fs/ioctl.c:707 [inline] > __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:707 > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > other info that might help us debug this: > > Chain exists of: > &sb->s_type->i_mutex_key#9 --> &mm->mmap_sem --> ashmem_mutex > > Possible unsafe locking scenario: > > CPU0 CPU1 > ---- ---- > lock(ashmem_mutex); > lock(&mm->mmap_sem); > lock(ashmem_mutex); > lock(&sb->s_type->i_mutex_key#9); > > *** DEADLOCK *** > > 1 lock held by syz-executor900/4483: > #0: 0000000025208078 (ashmem_mutex){+.+.}, at: > ashmem_shrink_scan+0xb4/0x630 drivers/staging/android/ashmem.c:448 Link: http://lkml.kernel.org/r/20180821231835.166639-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reported-by: syzbot Reviewed-by: NeilBrown Suggested-by: NeilBrown Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 1b11ccc0a3b7..8e506a45a6ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1464,6 +1464,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode 
mpol_shared_policy_init(&info->policy, NULL); break; } + + lockdep_annotate_inode_mutex_key(inode); } else shmem_free_inode(sb); return inode; From cb66016b7b895b8f27609ee128c0b71da7213816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 13 Sep 2018 16:43:07 +0200 Subject: [PATCH 1112/1212] gso_segment: Reset skb->mac_len after modifying network header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c56cae23c6b167acc68043c683c4573b80cbcc2c ] When splitting a GSO segment that consists of encapsulated packets, the skb->mac_len of the segments can end up being set wrong, causing packet drops in particular when using act_mirred and ifb interfaces in combination with a qdisc that splits GSO packets. This happens because at the time skb_segment() is called, network_header will point to the inner header, throwing off the calculation in skb_reset_mac_len(). The network_header is subsequently adjusted by the outer IP gso_segment handlers, but they don't set the mac_len. Fix this by adding skb_reset_mac_len() calls to both the IPv4 and IPv6 gso_segment handlers, after they modify the network_header. Many thanks to Eric Dumazet for his help in identifying the cause of the bug. Acked-by: Dave Taht Reviewed-by: Eric Dumazet Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/af_inet.c | 1 + net/ipv6/ip6_offload.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 967a47ff78a4..b12721ecb0b6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1280,6 +1280,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (encap) skb_reset_inner_headers(skb); skb->network_header = (u8 *)iph - skb->head; + skb_reset_mac_len(skb); } while ((skb = skb->next)); out: diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9e2ea4ae840d..244b9fec9d4d 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -118,6 +118,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h)); skb->network_header = (u8 *)ipv6h - skb->head; + skb_reset_mac_len(skb); if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); From 2ec3b47a784685347c29ff5e46bc1f2429eaf264 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Sep 2018 12:02:31 -0700 Subject: [PATCH 1113/1212] ipv6: fix possible use-after-free in ip6_xmit() [ Upstream commit bbd6528d28c1b8e80832b3b018ec402b6f5c3215 ] In the unlikely case ip6_xmit() has to call skb_realloc_headroom(), we need to call skb_set_owner_w() before consuming original skb, otherwise we risk a use-after-free. Bring IPv6 in line with what we do in IPv4 to fix this. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0feede45bd28..530b62fd6b64 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -193,12 +193,10 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); consume_skb(skb); skb = skb2; - /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically, - * it is safe to call in our context (socket lock not held) - */ - skb_set_owner_w(skb, (struct sock *)sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); From fee0d234419708192925c9e25a461f1f43dab24f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 12 Sep 2018 07:36:35 +0200 Subject: [PATCH 1114/1212] net/appletalk: fix minor pointer leak to userspace in SIOCFINDIPDDPRT [ Upstream commit 9824dfae5741275473a23a7ed5756c7b6efacc9d ] Fields ->dev and ->next of struct ipddp_route may be copied to userspace on the SIOCFINDIPDDPRT ioctl. This is only accessible to CAP_NET_ADMIN though. Let's manually copy the relevant fields instead of using memcpy(). BugLink: http://blog.infosectcbr.com.au/2018/09/linux-kernel-infoleaks.html Cc: Jann Horn Signed-off-by: Willy Tarreau Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/appletalk/ipddp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 2e4649655181..4e98e5aff7c5 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -284,8 +284,12 @@ static int ipddp_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCFINDIPDDPRT: spin_lock_bh(&ipddp_route_lock); rp = __ipddp_find_route(&rcp); - if (rp) - memcpy(&rcp2, rp, sizeof(rcp2)); + if (rp) { + memset(&rcp2, 0, sizeof(rcp2)); + rcp2.ip = rp->ip; + rcp2.at = rp->at; + rcp2.flags = rp->flags; + } spin_unlock_bh(&ipddp_route_lock); if (rp) { From b8214c557ca00d9e3110555d919f44ff16d512a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 14 Sep 2018 17:39:53 +0100 Subject: [PATCH 1115/1212] net: hp100: fix always-true check for link up state [ Upstream commit a7f38002fb69b44f8fc622ecb838665d0b8666af ] The operation ~(hp100_inb(VG_LAN_CFG_1) & HP100_LINK_UP_ST) returns a value that is always non-zero and hence the wait for the link to drop always terminates prematurely. Fix this by using a logical not operator instead of a bitwise complement. This issue has been in the driver since pre-2.6.12-rc2. Detected by CoverityScan, CID#114157 ("Logical vs. bitwise operator") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/hp/hp100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hp/hp100.c b/drivers/net/ethernet/hp/hp100.c index 3daf2d4a7ca0..884aa809baac 100644 --- a/drivers/net/ethernet/hp/hp100.c +++ b/drivers/net/ethernet/hp/hp100.c @@ -2636,7 +2636,7 @@ static int hp100_login_to_vg_hub(struct net_device *dev, u_short force_relogin) /* Wait for link to drop */ time = jiffies + (HZ / 10); do { - if (~(hp100_inb(VG_LAN_CFG_1) & HP100_LINK_UP_ST)) + if (!(hp100_inb(VG_LAN_CFG_1) & HP100_LINK_UP_ST)) break; if (!in_interrupt()) schedule_timeout_interruptible(1); From c6e386425347a8e704440f925d49496e5edfea60 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Thu, 13 Sep 2018 11:12:03 -0700 Subject: [PATCH 1116/1212] neighbour: confirm neigh entries when ARP packet is received [ Upstream commit f0e0d04413fcce9bc76388839099aee93cd0d33b ] Update 'confirmed' timestamp when ARP packet is received. It shouldn't affect locktime logic and anyway entry can be confirmed by any higher-layer protocol. Thus it makes sense to confirm it when ARP packet is received. Fixes: 77d7123342dc ("neighbour: update neigh timestamps iff update is effective") Signed-off-by: Vasily Khoruzhick Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/neighbour.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f60b93627876..78dc184072e8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1140,6 +1140,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, lladdr = neigh->ha; } + /* Update confirmed timestamp for neighbour entry after we + * received ARP packet even if it doesn't change IP to MAC binding. + */ + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. 
*/ @@ -1163,15 +1169,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } } - /* Update timestamps only once we know we will make a change to the + /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ - if (new != old || lladdr != neigh->ha) { - if (new & NUD_CONNECTED) - neigh->confirmed = jiffies; + if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; - } if (new != old) { neigh_del_timer(neigh); From afba6121b3b7e1737b7e7dafbf6b7eafbcdc659c Mon Sep 17 00:00:00 2001 From: Vincent Pelletier Date: Sun, 9 Sep 2018 04:09:26 +0000 Subject: [PATCH 1117/1212] scsi: target: iscsi: Use hex2bin instead of a re-implementation commit 1816494330a83f2a064499d8ed2797045641f92c upstream. This change has the following effects, in order of decreasing importance: 1) Prevent a stack buffer overflow 2) Do not append an unnecessary NULL to an anyway binary buffer, which is writing one byte past client_digest when caller is: chap_string_to_hex(client_digest, chap_r, strlen(chap_r)); The latter was found by KASAN (see below) when input value has expected size (32 hex chars), and further analysis revealed a stack buffer overflow can happen when network-received value is longer, allowing an unauthenticated remote attacker to smash up to 17 bytes after destination buffer (16 bytes attacker-controlled and one null). As switching to hex2bin requires specifying destination buffer length, and does not internally append any null, it solves both issues. This addresses CVE-2018-14633. Beyond this: - Validate received value length and check hex2bin accepted the input, to log this rejection reason instead of just failing authentication. - Only log received CHAP_R and CHAP_C values once they passed sanity checks. 
================================================================== BUG: KASAN: stack-out-of-bounds in chap_string_to_hex+0x32/0x60 [iscsi_target_mod] Write of size 1 at addr ffff8801090ef7c8 by task kworker/0:0/1021 CPU: 0 PID: 1021 Comm: kworker/0:0 Tainted: G O 4.17.8kasan.sess.connops+ #2 Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 05/19/2014 Workqueue: events iscsi_target_do_login_rx [iscsi_target_mod] Call Trace: dump_stack+0x71/0xac print_address_description+0x65/0x22e ? chap_string_to_hex+0x32/0x60 [iscsi_target_mod] kasan_report.cold.6+0x241/0x2fd chap_string_to_hex+0x32/0x60 [iscsi_target_mod] chap_server_compute_md5.isra.2+0x2cb/0x860 [iscsi_target_mod] ? chap_binaryhex_to_asciihex.constprop.5+0x50/0x50 [iscsi_target_mod] ? ftrace_caller_op_ptr+0xe/0xe ? __orc_find+0x6f/0xc0 ? unwind_next_frame+0x231/0x850 ? kthread+0x1a0/0x1c0 ? ret_from_fork+0x35/0x40 ? ret_from_fork+0x35/0x40 ? iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? deref_stack_reg+0xd0/0xd0 ? iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? is_module_text_address+0xa/0x11 ? kernel_text_address+0x4c/0x110 ? __save_stack_trace+0x82/0x100 ? ret_from_fork+0x35/0x40 ? save_stack+0x8c/0xb0 ? 0xffffffffc1660000 ? iscsi_target_do_login+0x155/0x8d0 [iscsi_target_mod] ? iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? process_one_work+0x35c/0x640 ? worker_thread+0x66/0x5d0 ? kthread+0x1a0/0x1c0 ? ret_from_fork+0x35/0x40 ? iscsi_update_param_value+0x80/0x80 [iscsi_target_mod] ? iscsit_release_cmd+0x170/0x170 [iscsi_target_mod] chap_main_loop+0x172/0x570 [iscsi_target_mod] ? chap_server_compute_md5.isra.2+0x860/0x860 [iscsi_target_mod] ? rx_data+0xd6/0x120 [iscsi_target_mod] ? iscsit_print_session_params+0xd0/0xd0 [iscsi_target_mod] ? cyc2ns_read_begin.part.2+0x90/0x90 ? _raw_spin_lock_irqsave+0x25/0x50 ? memcmp+0x45/0x70 iscsi_target_do_login+0x875/0x8d0 [iscsi_target_mod] ? 
iscsi_target_check_first_request.isra.5+0x1a0/0x1a0 [iscsi_target_mod] ? del_timer+0xe0/0xe0 ? memset+0x1f/0x40 ? flush_sigqueue+0x29/0xd0 iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? iscsi_target_nego_release+0x80/0x80 [iscsi_target_mod] ? iscsi_target_restore_sock_callbacks+0x130/0x130 [iscsi_target_mod] process_one_work+0x35c/0x640 worker_thread+0x66/0x5d0 ? flush_rcu_work+0x40/0x40 kthread+0x1a0/0x1c0 ? kthread_bind+0x30/0x30 ret_from_fork+0x35/0x40 The buggy address belongs to the page: page:ffffea0004243bc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0x17fffc000000000() raw: 017fffc000000000 0000000000000000 0000000000000000 00000000ffffffff raw: ffffea0004243c20 ffffea0004243ba0 0000000000000000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801090ef680: f2 f2 f2 f2 f2 f2 f2 01 f2 f2 f2 f2 f2 f2 f2 00 ffff8801090ef700: f2 f2 f2 f2 f2 f2 f2 00 02 f2 f2 f2 f2 f2 f2 00 >ffff8801090ef780: 00 f2 f2 f2 f2 f2 f2 00 00 f2 f2 f2 f2 f2 f2 00 ^ ffff8801090ef800: 00 f2 f2 f2 f2 f2 f2 00 00 00 00 02 f2 f2 f2 f2 ffff8801090ef880: f2 f2 f2 00 00 00 00 00 00 00 00 f2 f2 f2 f2 00 ================================================================== Signed-off-by: Vincent Pelletier Reviewed-by: Mike Christie Signed-off-by: Martin K. 
Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target_auth.c | 30 +++++++++++------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c index 47e249dccb5f..b380bc7ee10a 100644 --- a/drivers/target/iscsi/iscsi_target_auth.c +++ b/drivers/target/iscsi/iscsi_target_auth.c @@ -26,18 +26,6 @@ #include "iscsi_target_nego.h" #include "iscsi_target_auth.h" -static int chap_string_to_hex(unsigned char *dst, unsigned char *src, int len) -{ - int j = DIV_ROUND_UP(len, 2), rc; - - rc = hex2bin(dst, src, j); - if (rc < 0) - pr_debug("CHAP string contains non hex digit symbols\n"); - - dst[j] = '\0'; - return j; -} - static void chap_binaryhex_to_asciihex(char *dst, char *src, int src_len) { int i; @@ -241,9 +229,16 @@ static int chap_server_compute_md5( pr_err("Could not find CHAP_R.\n"); goto out; } + if (strlen(chap_r) != MD5_SIGNATURE_SIZE * 2) { + pr_err("Malformed CHAP_R\n"); + goto out; + } + if (hex2bin(client_digest, chap_r, MD5_SIGNATURE_SIZE) < 0) { + pr_err("Malformed CHAP_R\n"); + goto out; + } pr_debug("[server] Got CHAP_R=%s\n", chap_r); - chap_string_to_hex(client_digest, chap_r, strlen(chap_r)); tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) { @@ -348,9 +343,7 @@ static int chap_server_compute_md5( pr_err("Could not find CHAP_C.\n"); goto out; } - pr_debug("[server] Got CHAP_C=%s\n", challenge); - challenge_len = chap_string_to_hex(challenge_binhex, challenge, - strlen(challenge)); + challenge_len = DIV_ROUND_UP(strlen(challenge), 2); if (!challenge_len) { pr_err("Unable to convert incoming challenge\n"); goto out; @@ -359,6 +352,11 @@ static int chap_server_compute_md5( pr_err("CHAP_C exceeds maximum binary size of 1024 bytes\n"); goto out; } + if (hex2bin(challenge_binhex, challenge, challenge_len) < 0) { + pr_err("Malformed CHAP_C\n"); + goto out; + } + pr_debug("[server] Got CHAP_C=%s\n", challenge); /* * 
During mutual authentication, the CHAP_C generated by the * initiator must not match the original CHAP_C generated by From 98e14c520fff92344cb0c7472be4b963fc5fa44f Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 20 Sep 2018 12:22:51 -0700 Subject: [PATCH 1118/1212] ocfs2: fix ocfs2 read block panic commit 234b69e3e089d850a98e7b3145bd00e9b52b1111 upstream. While reading block, it is possible that io error return due to underlying storage issue, in this case, BH_NeedsValidate was left in the buffer head. Then when reading the very block next time, if it was already linked into journal, that will trigger the following panic. [203748.702517] kernel BUG at fs/ocfs2/buffer_head_io.c:342! [203748.702533] invalid opcode: 0000 [#1] SMP [203748.702561] Modules linked in: ocfs2 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sunrpc dm_switch dm_queue_length dm_multipath bonding be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i iw_cxgb4 cxgb4 cxgb3i libcxgbi iw_cxgb3 cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ipmi_devintf iTCO_wdt iTCO_vendor_support dcdbas ipmi_ssif i2c_core ipmi_si ipmi_msghandler acpi_pad pcspkr sb_edac edac_core lpc_ich mfd_core shpchp sg tg3 ptp pps_core ext4 jbd2 mbcache2 sr_mod cdrom sd_mod ahci libahci megaraid_sas wmi dm_mirror dm_region_hash dm_log dm_mod [203748.703024] CPU: 7 PID: 38369 Comm: touch Not tainted 4.1.12-124.18.6.el6uek.x86_64 #2 [203748.703045] Hardware name: Dell Inc. 
PowerEdge R620/0PXXHP, BIOS 2.5.2 01/28/2015 [203748.703067] task: ffff880768139c00 ti: ffff88006ff48000 task.ti: ffff88006ff48000 [203748.703088] RIP: 0010:[] [] ocfs2_read_blocks+0x669/0x7f0 [ocfs2] [203748.703130] RSP: 0018:ffff88006ff4b818 EFLAGS: 00010206 [203748.703389] RAX: 0000000008620029 RBX: ffff88006ff4b910 RCX: 0000000000000000 [203748.703885] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 00000000023079fe [203748.704382] RBP: ffff88006ff4b8d8 R08: 0000000000000000 R09: ffff8807578c25b0 [203748.704877] R10: 000000000f637376 R11: 000000003030322e R12: 0000000000000000 [203748.705373] R13: ffff88006ff4b910 R14: ffff880732fe38f0 R15: 0000000000000000 [203748.705871] FS: 00007f401992c700(0000) GS:ffff880bfebc0000(0000) knlGS:0000000000000000 [203748.706370] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [203748.706627] CR2: 00007f4019252440 CR3: 00000000a621e000 CR4: 0000000000060670 [203748.707124] Stack: [203748.707371] ffff88006ff4b828 ffffffffa0609f52 ffff88006ff4b838 0000000000000001 [203748.707885] 0000000000000000 0000000000000000 ffff880bf67c3800 ffffffffa05eca00 [203748.708399] 00000000023079ff ffffffff81c58b80 0000000000000000 0000000000000000 [203748.708915] Call Trace: [203748.709175] [] ? ocfs2_inode_cache_io_unlock+0x12/0x20 [ocfs2] [203748.709680] [] ? ocfs2_empty_dir_filldir+0x80/0x80 [ocfs2] [203748.710185] [] ocfs2_read_dir_block_direct+0x3b/0x200 [ocfs2] [203748.710691] [] ocfs2_prepare_dx_dir_for_insert.isra.57+0x19f/0xf60 [ocfs2] [203748.711204] [] ? ocfs2_metadata_cache_io_unlock+0x1f/0x30 [ocfs2] [203748.711716] [] ocfs2_prepare_dir_for_insert+0x13a/0x890 [ocfs2] [203748.712227] [] ? ocfs2_check_dir_for_entry+0x8e/0x140 [ocfs2] [203748.712737] [] ocfs2_mknod+0x4b2/0x1370 [ocfs2] [203748.713003] [] ocfs2_create+0x65/0x170 [ocfs2] [203748.713263] [] vfs_create+0xdb/0x150 [203748.713518] [] do_last+0x815/0x1210 [203748.713772] [] ? path_init+0xb9/0x450 [203748.714123] [] path_openat+0x80/0x600 [203748.714378] [] ? 
handle_pte_fault+0xd15/0x1620 [203748.714634] [] do_filp_open+0x3a/0xb0 [203748.714888] [] ? __alloc_fd+0xa7/0x130 [203748.715143] [] do_sys_open+0x12c/0x220 [203748.715403] [] ? syscall_trace_enter_phase1+0x11b/0x180 [203748.715668] [] ? system_call_after_swapgs+0xe9/0x190 [203748.715928] [] SyS_open+0x1e/0x20 [203748.716184] [] system_call_fastpath+0x18/0xd7 [203748.716440] Code: 00 00 48 8b 7b 08 48 83 c3 10 45 89 f8 44 89 e1 44 89 f2 4c 89 ee e8 07 06 11 e1 48 8b 03 48 85 c0 75 df 8b 5d c8 e9 4d fa ff ff <0f> 0b 48 8b 7d a0 e8 dc c6 06 00 48 b8 00 00 00 00 00 00 00 10 [203748.717505] RIP [] ocfs2_read_blocks+0x669/0x7f0 [ocfs2] [203748.717775] RSP Joseph once reported a similar panic. Link: https://oss.oracle.com/pipermail/ocfs2-devel/2013-May/008931.html Link: http://lkml.kernel.org/r/20180912063207.29484-1-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Cc: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Changwei Ge Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/buffer_head_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index fe50ded1b4ce..272269f1c310 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -336,6 +336,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, * for this bh as it's not marked locally * uptodate. */ status = -EIO; + clear_buffer_needs_validate(bh); put_bh(bh); bhs[i] = NULL; continue; From 64436716c3a538ed65413c5257c6176dae5d807c Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Wed, 15 Aug 2018 15:00:14 -0400 Subject: [PATCH 1119/1212] drm/nouveau/drm/nouveau: Use pm_runtime_get_noresume() in connector_detect() commit 6833fb1ec120bf078e1a527c573a09d4de286224 upstream. It's true we can't resume the device from poll workers in nouveau_connector_detect(). 
We can however, prevent the autosuspend timer from elapsing immediately if it hasn't already without risking any sort of deadlock with the runtime suspend/resume operations. So do that instead of entirely avoiding grabbing a power reference. Signed-off-by: Lyude Paul Reviewed-by: Karol Herbst Acked-by: Daniel Vetter Cc: stable@vger.kernel.org Cc: Lukas Wunner Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nouveau_connector.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index ababdaabe870..1855b475cc0b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -253,12 +253,16 @@ nouveau_connector_detect(struct drm_connector *connector, bool force) nv_connector->edid = NULL; } - /* Outputs are only polled while runtime active, so acquiring a - * runtime PM ref here is unnecessary (and would deadlock upon - * runtime suspend because it waits for polling to finish). + /* Outputs are only polled while runtime active, so resuming the + * device here is unnecessary (and would deadlock upon runtime suspend + * because it waits for polling to finish). We do however, want to + * prevent the autosuspend timer from elapsing during this operation + * if possible. 
*/ - if (!drm_kms_helper_is_poll_worker()) { - ret = pm_runtime_get_sync(connector->dev->dev); + if (drm_kms_helper_is_poll_worker()) { + pm_runtime_get_noresume(dev->dev); + } else { + ret = pm_runtime_get_sync(dev->dev); if (ret < 0 && ret != -EACCES) return conn_status; } @@ -329,10 +333,8 @@ nouveau_connector_detect(struct drm_connector *connector, bool force) out: - if (!drm_kms_helper_is_poll_worker()) { - pm_runtime_mark_last_busy(connector->dev->dev); - pm_runtime_put_autosuspend(connector->dev->dev); - } + pm_runtime_mark_last_busy(dev->dev); + pm_runtime_put_autosuspend(dev->dev); return conn_status; } From 1aa698b65186c13ed775896ed1dfec7c26c73d60 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 16 Aug 2018 15:30:38 -0500 Subject: [PATCH 1120/1212] tty: vt_ioctl: fix potential Spectre v1 commit e97267cb4d1ee01ca0929638ec0fcbb0904f903d upstream. vsa.console is indirectly controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: drivers/tty/vt/vt_ioctl.c:711 vt_ioctl() warn: potential spectre issue 'vc_cons' [r] Fix this by sanitizing vsa.console before using it to index vc_cons Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2 Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt_ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index 97d5a74558a3..a86bc7afb3b2 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -703,6 +705,8 @@ int vt_ioctl(struct tty_struct *tty, if (vsa.console == 0 || vsa.console > MAX_NR_CONSOLES) ret = -ENXIO; else { + vsa.console = array_index_nospec(vsa.console, + MAX_NR_CONSOLES + 1); vsa.console--; console_lock(); ret = vc_allocate(vsa.console); From 7619c7f66012b18aa634b1091e11ca12e574bbdb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Aug 2018 09:22:45 -0400 Subject: [PATCH 1121/1212] ext4: avoid divide by zero fault when deleting corrupted inline directories commit 4d982e25d0bdc83d8c64e66fdeca0b89240b3b85 upstream. A specially crafted file system can trick empty_inline_dir() into reading past the last valid entry in a inline directory, and then run into the end of xattr marker. This will trigger a divide by zero fault. Fix this by using the size of the inline directory instead of dir->i_size. Also clean up error reporting in __ext4_check_dir_entry so that the message is clearer and more understandable --- and avoids the division by zero trap if the size passed in is zero. (I'm not sure why we coded it that way in the first place; printing offset % size is actually more confusing and less useful.) 
https://bugzilla.kernel.org/show_bug.cgi?id=200933 Signed-off-by: Theodore Ts'o Reported-by: Wen Xu Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/dir.c | 20 +++++++++----------- fs/ext4/inline.c | 4 +++- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 6d17f31a31d7..8eb768bbf5b5 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -74,7 +74,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; else if (unlikely(((char *) de - buf) + rlen > size)) - error_msg = "directory entry across range"; + error_msg = "directory entry overrun"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -83,18 +83,16 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, name_len=%d, size=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, de->name_len, size); else ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, name_len=%d, size=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, de->name_len, size); return 1; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c449bc089c94..1e7a9774119c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1756,6 +1756,7 @@ int empty_inline_dir(struct inode *dir, 
int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; + size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; @@ -1783,8 +1784,9 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) goto out; } + inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; - while (offset < dir->i_size) { + while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, From 66671ee85afdcdbf8152a2389bce2ebd10f5beff Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 1 Sep 2018 14:42:14 -0400 Subject: [PATCH 1122/1212] ext4: recalucate superblock checksum after updating free blocks/inodes commit 4274f516d4bc50648a4d97e4f67ecbd7b65cde4a upstream. When mounting the superblock, ext4_fill_super() calculates the free blocks and free inodes and stores them in the superblock. It's not strictly necessary, since we don't use them any more, but it's nice to keep them roughly aligned to reality. Since it's not critical for file system correctness, the code doesn't call ext4_commit_super(). The problem is that it's in ext4_commit_super() that we recalculate the superblock checksum. So if we're not going to call ext4_commit_super(), we need to call ext4_superblock_csum_set() to make sure the superblock checksum is consistent. Most of the time, this doesn't matter, since we end up calling ext4_commit_super() very soon thereafter, and definitely by the time the file system is unmounted. However, it doesn't work in this sequence: mke2fs -Fq -t ext4 /dev/vdc 128M mount /dev/vdc /vdc cp xfstests/git-versions /vdc godown /vdc umount /vdc mount /dev/vdc tune2fs -l /dev/vdc With this commit, the "tune2fs -l" no longer fails. 
Reported-by: Chengguang Xu Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8d18f6142da5..a3d905abbaa9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4024,11 +4024,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); + ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } From 70083af59263033d8cd47f0340a0e81bcf7252c5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 3 Sep 2018 22:19:43 -0400 Subject: [PATCH 1123/1212] ext4: fix online resize's handling of a too-small final block group commit f0a459dec5495a3580f8d784555e6f8f3bf7f263 upstream. Avoid growing the file system to an extent so that the last block group is too small to hold all of the metadata that must be stored in the block group. 
This problem can be triggered with the following reproducer: umount /mnt mke2fs -F -m0 -b 4096 -t ext4 -O resize_inode,^has_journal \ -E resize=1073741824 /tmp/foo.img 128M mount /tmp/foo.img /mnt truncate --size 1708M /tmp/foo.img resize2fs /dev/loop0 295400 umount /mnt e2fsck -fy /tmp/foo.img Reported-by: Torsten Hilbrich Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d2421fd38833..9693743cd09f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1954,6 +1954,26 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) } } + /* + * Make sure the last group has enough space so that it's + * guaranteed to have enough space for all metadata blocks + * that it might need to hold. (We might not need to store + * the inode table blocks in the last block group, but there + * will be cases where this might be needed.) + */ + if ((ext4_group_first_block_no(sb, n_group) + + ext4_group_overhead_blocks(sb, n_group) + 2 + + sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) { + n_blocks_count = ext4_group_first_block_no(sb, n_group); + n_group--; + n_blocks_count_retry = 0; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + goto retry; + } + /* extend the last group */ if (n_group == o_group) add = n_blocks_count - o_blocks_count; From 47af99763a89aca2af2cbcad56629213769f6349 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 3 Sep 2018 22:25:01 -0400 Subject: [PATCH 1124/1212] ext4: fix online resizing for bigalloc file systems with a 1k block size commit 5f8c10936fab2b69a487400f2872902e597dd320 upstream. An online resize of a file system with the bigalloc feature enabled and a 1k block size would be refused since ext4_resize_begin() did not understand s_first_data_block is 0 for all bigalloc file systems, even when the block size is 1k. 
Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9693743cd09f..783280ebc2fe 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -18,6 +18,7 @@ int ext4_resize_begin(struct super_block *sb) { + struct ext4_sb_info *sbi = EXT4_SB(sb); int ret = 0; if (!capable(CAP_SYS_RESOURCE)) @@ -28,7 +29,7 @@ int ext4_resize_begin(struct super_block *sb) * because the user tools have no way of handling this. Probably a * bad time to do it anyways. */ - if (EXT4_SB(sb)->s_sbh->b_blocknr != + if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); From e77dd99d4bf7fc2d48e2e7b02b671aec5ef6e531 Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Sat, 15 Sep 2018 17:11:25 -0400 Subject: [PATCH 1125/1212] ext4: don't mark mmp buffer head dirty commit fe18d649891d813964d3aaeebad873f281627fbc upstream. Marking mmp bh dirty before writing it will make writeback pick up mmp block later and submit a write, we don't want the duplicate write as kmmpd thread should have full control of reading and writing the mmp block. Another reason is we will also have random I/O error on the writeback request when blk integrity is enabled, because kmmpd could modify the content of the mmp block(e.g. setting new seq and time) while the mmp block is under I/O requested by writeback. 
Signed-off-by: Li Dongyang Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mmp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0a512aa81bf7..4c9d799955d1 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -48,7 +48,6 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) */ sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); - mark_buffer_dirty(bh); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); From d2e646c72382f01e8b17d06f716fd6f998f926a4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 4 Dec 2015 12:42:29 +0000 Subject: [PATCH 1126/1212] arm64: Add trace_hardirqs_off annotation in ret_to_user commit db3899a6477a4dccd26cbfb7f408b6be2cc068e0 upstream. When a kernel is built with CONFIG_TRACE_IRQFLAGS the following warning is produced when entering userspace for the first time: WARNING: at /work/Linux/linux-2.6-aarch64/kernel/locking/lockdep.c:3519 Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc3+ #639 Hardware name: Juno (DT) task: ffffffc9768a0000 ti: ffffffc9768a8000 task.ti: ffffffc9768a8000 PC is at check_flags.part.22+0x19c/0x1a8 LR is at check_flags.part.22+0x19c/0x1a8 pc : [] lr : [] pstate: 600001c5 sp : ffffffc9768abe10 x29: ffffffc9768abe10 x28: ffffffc9768a8000 x27: 0000000000000000 x26: 0000000000000001 x25: 00000000000000a6 x24: ffffffc00064be6c x23: ffffffc0009f249e x22: ffffffc9768a0000 x21: ffffffc97fea5480 x20: 00000000000001c0 x19: ffffffc00169a000 x18: 0000005558cc7b58 x17: 0000007fb78e3180 x16: 0000005558d2e238 x15: ffffffffffffffff x14: 0ffffffffffffffd x13: 0000000000000008 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: fefefefefefeff63 x9 : 7f7f7f7f7f7f7f7f x8 : 6e655f7371726964 x7 : 0000000000000001 x6 : ffffffc0001079c4 x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc001698438 x2 : 0000000000000000 x1 : ffffffc9768a0000 x0 : 000000000000002e 
Call trace: [] check_flags.part.22+0x19c/0x1a8 [] lock_is_held+0x80/0x98 [] __schedule+0x404/0x730 [] schedule+0x44/0xb8 [] ret_to_user+0x0/0x24 possible reason: unannotated irqs-off. irq event stamp: 502169 hardirqs last enabled at (502169): [] el0_irq_naked+0x1c/0x24 hardirqs last disabled at (502167): [] __do_softirq+0x17c/0x298 softirqs last enabled at (502168): [] __do_softirq+0x1fc/0x298 softirqs last disabled at (502143): [] irq_exit+0xa0/0xf0 This happens because we disable interrupts in ret_to_user before calling schedule() in work_resched. This patch adds the necessary trace_hardirqs_off annotation. Signed-off-by: Catalin Marinas Reported-by: Mark Rutland Cc: Will Deacon Signed-off-by: Will Deacon Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index dccd0c2e9023..3028d9b028c7 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -649,6 +649,9 @@ work_pending: bl do_notify_resume b ret_to_user work_resched: +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off // the IRQs are off here, inform the tracing code +#endif bl schedule /* From ce144dbfb4f36223a50414fdfe5cadc6afc98a0d Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander Date: Fri, 7 Oct 2016 12:39:40 -0700 Subject: [PATCH 1127/1212] HID: sony: Update device ids commit cf1015d65d7c8a5504a4c03afb60fb86bff0f032 upstream. Support additional DS4 model. 
Signed-off-by: Roderick Colenbrander Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-core.c | 2 ++ drivers/hid/hid-ids.h | 1 + drivers/hid/hid-sony.c | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index e4541c6bf3d3..b992d8b010f8 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2011,6 +2011,8 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS3_CONTROLLER) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER) }, + { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGX_MOUSE) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGP_MOUSE) }, { HID_USB_DEVICE(USB_VENDOR_ID_STEELSERIES, USB_DEVICE_ID_STEELSERIES_SRWS1) }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 60e2c9faa95f..6e25168df6a2 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -879,6 +879,7 @@ #define USB_DEVICE_ID_SONY_PS3_BDREMOTE 0x0306 #define USB_DEVICE_ID_SONY_PS3_CONTROLLER 0x0268 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER 0x05c4 +#define USB_DEVICE_ID_SONY_PS4_CONTROLLER_2 0x09cc #define USB_DEVICE_ID_SONY_MOTION_CONTROLLER 0x03d5 #define USB_DEVICE_ID_SONY_NAVIGATION_CONTROLLER 0x042f #define USB_DEVICE_ID_SONY_BUZZ_CONTROLLER 0x0002 diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 21febbb0d84e..979160e71156 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -2460,6 +2460,10 @@ static const struct hid_device_id sony_devices[] = { .driver_data = DUALSHOCK4_CONTROLLER_USB }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, 
USB_DEVICE_ID_SONY_PS4_CONTROLLER), .driver_data = DUALSHOCK4_CONTROLLER_BT }, + { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2), + .driver_data = DUALSHOCK4_CONTROLLER_USB }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2), + .driver_data = DUALSHOCK4_CONTROLLER_BT }, { } }; MODULE_DEVICE_TABLE(hid, sony_devices); From 44c2e8a568d156baf09e8160513bb278c40ea4bd Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander Date: Wed, 23 Nov 2016 14:07:11 -0800 Subject: [PATCH 1128/1212] HID: sony: Support DS4 dongle commit de66a1a04c25f2560a8dca7a95e2a150b0d5e17e upstream. Add support for USB based DS4 dongle device, which allows connecting a DS4 through Bluetooth, but hides Bluetooth from the host system. Signed-off-by: Roderick Colenbrander Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-core.c | 1 + drivers/hid/hid-ids.h | 1 + drivers/hid/hid-sony.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index b992d8b010f8..4564ecf71181 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2013,6 +2013,7 @@ static const struct hid_device_id hid_have_special_driver[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2) }, + { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGX_MOUSE) }, { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_VAIO_VGP_MOUSE) }, { HID_USB_DEVICE(USB_VENDOR_ID_STEELSERIES, USB_DEVICE_ID_STEELSERIES_SRWS1) }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 6e25168df6a2..00d8366a614e 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -880,6 +880,7 @@ #define 
USB_DEVICE_ID_SONY_PS3_CONTROLLER 0x0268 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER 0x05c4 #define USB_DEVICE_ID_SONY_PS4_CONTROLLER_2 0x09cc +#define USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE 0x0ba0 #define USB_DEVICE_ID_SONY_MOTION_CONTROLLER 0x03d5 #define USB_DEVICE_ID_SONY_NAVIGATION_CONTROLLER 0x042f #define USB_DEVICE_ID_SONY_BUZZ_CONTROLLER 0x0002 diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 979160e71156..6f3d47185bf0 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -2464,6 +2464,8 @@ static const struct hid_device_id sony_devices[] = { .driver_data = DUALSHOCK4_CONTROLLER_USB }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_2), .driver_data = DUALSHOCK4_CONTROLLER_BT }, + { HID_USB_DEVICE(USB_VENDOR_ID_SONY, USB_DEVICE_ID_SONY_PS4_CONTROLLER_DONGLE), + .driver_data = DUALSHOCK4_CONTROLLER_USB }, { } }; MODULE_DEVICE_TABLE(hid, sony_devices); From 82ea790afe2726f0bb01a09e234be31bb6c7749b Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 31 Aug 2018 07:15:56 -0700 Subject: [PATCH 1129/1212] iw_cxgb4: only allow 1 flush on user qps commit 308aa2b8f7b7db3332a7d41099fd37851fb793b2 upstream. Once the qp has been flushed, it cannot be flushed again. The user qp flush logic wasn't enforcing it however. The bug can cause touch-after-free crashes like: Unable to handle kernel paging request for data at address 0x000001ec Faulting instruction address: 0xc008000016069100 Oops: Kernel access of bad area, sig: 11 [#1] ... 
NIP [c008000016069100] flush_qp+0x80/0x480 [iw_cxgb4] LR [c00800001606cd6c] c4iw_modify_qp+0x71c/0x11d0 [iw_cxgb4] Call Trace: [c00800001606cd6c] c4iw_modify_qp+0x71c/0x11d0 [iw_cxgb4] [c00800001606e868] c4iw_ib_modify_qp+0x118/0x200 [iw_cxgb4] [c0080000119eae80] ib_security_modify_qp+0xd0/0x3d0 [ib_core] [c0080000119c4e24] ib_modify_qp+0xc4/0x2c0 [ib_core] [c008000011df0284] iwcm_modify_qp_err+0x44/0x70 [iw_cm] [c008000011df0fec] destroy_cm_id+0xcc/0x370 [iw_cm] [c008000011ed4358] rdma_destroy_id+0x3c8/0x520 [rdma_cm] [c0080000134b0540] ucma_close+0x90/0x1b0 [rdma_ucm] [c000000000444da4] __fput+0xe4/0x2f0 So fix flush_qp() to only flush the wq once. Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/cxgb4/qp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 53aa7515f542..04206c600098 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1183,6 +1183,12 @@ static void flush_qp(struct c4iw_qp *qhp) t4_set_wq_in_error(&qhp->wq); if (qhp->ibqp.uobject) { + + /* for user qps, qhp->wq.flushed is protected by qhp->mutex */ + if (qhp->wq.flushed) + return; + + qhp->wq.flushed = 1; t4_set_cq_in_error(&rchp->cq); spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); From 9c6cd3f3a4b8194e82fa927bc00028c7a505e3b3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 29 Sep 2018 03:08:55 -0700 Subject: [PATCH 1130/1212] Linux 4.4.159 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d07a6283b67e..06d5c6a6a0f6 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 158 +SUBLEVEL = 159 EXTRAVERSION = NAME = Blurry Fish Butt From d36d92bca7f25cd3376fd5710cfe80571d0b7b94 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers 
Date: Thu, 21 Jun 2018 09:23:22 -0700 Subject: [PATCH 1131/1212] compiler-gcc.h: Add __attribute__((gnu_inline)) to all inline declarations Functions marked extern inline do not emit an externally visible function when the gnu89 C standard is used. Some KBUILD Makefiles overwrite KBUILD_CFLAGS. This is an issue for GCC 5.1+ users as without an explicit C standard specified, the default is gnu11. Since c99, the semantics of extern inline have changed such that an externally visible function is always emitted. This can lead to multiple definition errors of extern inline functions at link time of compilation units whose build files have removed an explicit C standard compiler flag for users of GCC 5.1+ or Clang. Suggested-by: Arnd Bergmann Suggested-by: H. Peter Anvin Suggested-by: Joe Perches Signed-off-by: Nick Desaulniers Acked-by: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Cc: akataria@vmware.com Cc: akpm@linux-foundation.org Cc: andrea.parri@amarulasolutions.com Cc: ard.biesheuvel@linaro.org Cc: aryabinin@virtuozzo.com Cc: astrachan@google.com Cc: boris.ostrovsky@oracle.com Cc: brijesh.singh@amd.com Cc: caoj.fnst@cn.fujitsu.com Cc: geert@linux-m68k.org Cc: ghackmann@google.com Cc: gregkh@linuxfoundation.org Cc: jan.kiszka@siemens.com Cc: jarkko.sakkinen@linux.intel.com Cc: jpoimboe@redhat.com Cc: keescook@google.com Cc: kirill.shutemov@linux.intel.com Cc: kstewart@linuxfoundation.org Cc: linux-efi@vger.kernel.org Cc: linux-kbuild@vger.kernel.org Cc: manojgupta@google.com Cc: mawilcox@microsoft.com Cc: michal.lkml@markovi.net Cc: mjg59@google.com Cc: mka@chromium.org Cc: pombredanne@nexb.com Cc: rientjes@google.com Cc: rostedt@goodmis.org Cc: sedat.dilek@gmail.com Cc: thomas.lendacky@amd.com Cc: tstellar@redhat.com Cc: tweek@google.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: yamada.masahiro@socionext.com Link: http://lkml.kernel.org/r/20180621162324.36656-2-ndesaulniers@google.com 
Signed-off-by: Ingo Molnar (cherry picked from commit d03db2bc26f0e4a6849ad649a09c9c73fccdc656) Signed-off-by: Mark Brown --- include/linux/compiler-gcc.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 85d6568010fd..143d40e8a1ea 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -83,17 +83,18 @@ * -Wunused-function. This turns out to avoid the need for complex #ifdef * directives. Suppress the warning in clang as well by using "unused" * function attribute, which is redundant but not harmful for gcc. + * Prefer gnu_inline, so that extern inline functions do not emit an + * externally visible function. This makes extern inline behave as per gnu89 + * semantics rather than c99. This prevents multiple symbol definition errors + * of extern inline functions at link time. + * A lot of inline functions can cause havoc with function tracing. */ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline,unused)) notrace -#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace -#define __inline __inline __attribute__((always_inline,unused)) notrace +#define inline \ + inline __attribute__((always_inline, unused)) notrace __gnu_inline #else -/* A lot of inline functions can cause havoc with function tracing */ -#define inline inline __attribute__((unused)) notrace -#define __inline__ __inline__ __attribute__((unused)) notrace -#define __inline __inline __attribute__((unused)) notrace +#define inline inline __attribute__((unused)) notrace __gnu_inline #endif #define __inline__ inline From 78291ae15aaad8e4960a00d75acc35a225e9bfa1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 Oct 2018 10:42:58 -0700 Subject: [PATCH 1132/1212] Revert "f2fs: use timespec64 for inode timestamps" This reverts commit 
e7406233c15f23d796d2e100872507d4ddc61e7e. This is to fix build errors for mips. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/namei.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b0151dd8ed76..787df98db916 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -752,8 +752,8 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ - struct timespec64 i_crtime; /* inode creation time */ - struct timespec64 i_disk_time[4];/* inode disk times */ + struct timespec i_crtime; /* inode creation time */ + struct timespec i_disk_time[4]; /* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2636,13 +2636,13 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, &F2FS_I(inode)->i_crtime)) return false; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f3be4dc3cbe2..49cc29df9800 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,8 +50,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - F2FS_I(inode)->i_crtime = inode->i_mtime; + inode->i_mtime 
= inode->i_atime = inode->i_ctime = + F2FS_I(inode)->i_crtime = current_time(inode); inode->i_generation = sbi->s_next_generation++; if (S_ISDIR(inode->i_mode)) From dc5836a4dabb74247338668db922922af9e10ae5 Mon Sep 17 00:00:00 2001 From: Alistair Strachan Date: Wed, 25 Jul 2018 16:11:38 -0700 Subject: [PATCH 1133/1212] x86_64_cuttlefish_defconfig: enable verity cert Bug: 72722987 Test: Build, boot and verify in /proc/keys Change-Id: Ia55b94d56827003a88cb6083a75340ee31347470 Signed-off-by: Alistair Strachan Signed-off-by: Amit Pundir --- arch/x86/configs/x86_64_cuttlefish_defconfig | 5 ++++ verity_dev_keys.x509 | 24 ++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 verity_dev_keys.x509 diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 71026930c04c..7b63741c622d 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -447,3 +447,8 @@ CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_SHA512=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="verity_dev_keys.x509" diff --git a/verity_dev_keys.x509 b/verity_dev_keys.x509 new file mode 100644 index 000000000000..86399c3c1dd7 --- /dev/null +++ b/verity_dev_keys.x509 @@ -0,0 +1,24 @@ +-----BEGIN CERTIFICATE----- +MIID/TCCAuWgAwIBAgIJAJcPmDkJqolJMA0GCSqGSIb3DQEBBQUAMIGUMQswCQYD +VQQGEwJVUzETMBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNTW91bnRhaW4g +VmlldzEQMA4GA1UECgwHQW5kcm9pZDEQMA4GA1UECwwHQW5kcm9pZDEQMA4GA1UE +AwwHQW5kcm9pZDEiMCAGCSqGSIb3DQEJARYTYW5kcm9pZEBhbmRyb2lkLmNvbTAe +Fw0xNDExMDYxOTA3NDBaFw00MjAzMjQxOTA3NDBaMIGUMQswCQYDVQQGEwJVUzET +MBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNTW91bnRhaW4gVmlldzEQMA4G +A1UECgwHQW5kcm9pZDEQMA4GA1UECwwHQW5kcm9pZDEQMA4GA1UEAwwHQW5kcm9p 
+ZDEiMCAGCSqGSIb3DQEJARYTYW5kcm9pZEBhbmRyb2lkLmNvbTCCASIwDQYJKoZI +hvcNAQEBBQADggEPADCCAQoCggEBAOjreE0vTVSRenuzO9vnaWfk0eQzYab0gqpi +6xAzi6dmD+ugoEKJmbPiuE5Dwf21isZ9uhUUu0dQM46dK4ocKxMRrcnmGxydFn6o +fs3ODJMXOkv2gKXL/FdbEPdDbxzdu8z3yk+W67udM/fW7WbaQ3DO0knu+izKak/3 +T41c5uoXmQ81UNtAzRGzGchNVXMmWuTGOkg6U+0I2Td7K8yvUMWhAWPPpKLtVH9r +AL5TzjYNR92izdKcz3AjRsI3CTjtpiVABGeX0TcjRSuZB7K9EK56HV+OFNS6I1NP +jdD7FIShyGlqqZdUOkAUZYanbpgeT5N7QL6uuqcGpoTOkalu6kkCAwEAAaNQME4w +HQYDVR0OBBYEFH5DM/m7oArf4O3peeKO0ZIEkrQPMB8GA1UdIwQYMBaAFH5DM/m7 +oArf4O3peeKO0ZIEkrQPMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEFBQADggEB +AHO3NSvDE5jFvMehGGtS8BnFYdFKRIglDMc4niWSzhzOVYRH4WajxdtBWc5fx0ix +NF/+hVKVhP6AIOQa+++sk+HIi7RvioPPbhjcsVlZe7cUEGrLSSveGouQyc+j0+m6 +JF84kszIl5GGNMTnx0XRPO+g8t6h5LWfnVydgZfpGRRg+WHewk1U2HlvTjIceb0N +dcoJ8WKJAFWdcuE7VIm4w+vF/DYX/A2Oyzr2+QRhmYSv1cusgAeC1tvH4ap+J1Lg +UnOu5Kh/FqPLLSwNVQp4Bu7b9QFfqK8Moj84bj88NqRGZgDyqzuTrFxn6FW7dmyA +yttuAJAEAymk1mipd9+zp38= +-----END CERTIFICATE----- From 97b9031454a38819aeb63d32ae3208d58a79c580 Mon Sep 17 00:00:00 2001 From: Alistair Strachan Date: Wed, 25 Jul 2018 16:11:09 -0700 Subject: [PATCH 1134/1212] x86_64_cuttlefish_defconfig: Enable android-verity Bug: 72722987 Test: Build & boot with x86_64_cuttlefish_defconfig Change-Id: I961e6aaa944b5ab0c005cb39604a52f8dc98fb06 Signed-off-by: Alistair Strachan Signed-off-by: Amit Pundir --- arch/x86/configs/x86_64_cuttlefish_defconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 7b63741c622d..df9b6bd228f7 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -214,13 +214,17 @@ CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_SPI_ATTRS=y CONFIG_SCSI_VIRTIO=y CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=y CONFIG_BLK_DEV_DM=y CONFIG_DM_CRYPT=y CONFIG_DM_MIRROR=y CONFIG_DM_ZERO=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y 
+CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE=1 CONFIG_DM_VERITY_FEC=y +CONFIG_DM_ANDROID_VERITY=y CONFIG_NETDEVICES=y CONFIG_NETCONSOLE=y CONFIG_NETCONSOLE_DYNAMIC=y From 072304e803ab1981914caf61e2de4fff21a624fe Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 26 Jul 2018 16:32:09 -0700 Subject: [PATCH 1135/1212] ANDROID: sdcardfs: Check stacked filesystem depth bug: 111860541 Change-Id: Ia0a30b2b8956c4ada28981584cd8647713a1e993 Signed-off-by: Daniel Rosenberg Signed-off-by: Amit Pundir --- fs/sdcardfs/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 30e0c431a1ea..27ec726e7a46 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -295,6 +295,13 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, atomic_inc(&lower_sb->s_active); sdcardfs_set_lower_super(sb, lower_sb); + sb->s_stack_depth = lower_sb->s_stack_depth + 1; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("sdcardfs: maximum fs stacking depth exceeded\n"); + err = -EINVAL; + goto out_sput; + } + /* inherit maxbytes from lower file system */ sb->s_maxbytes = lower_sb->s_maxbytes; From 688f9b4e1a46ca41f806d806e440079123fd29e6 Mon Sep 17 00:00:00 2001 From: Alistair Strachan Date: Wed, 22 Aug 2018 17:07:20 -0700 Subject: [PATCH 1136/1212] ANDROID: Refresh x86_64_cuttlefish_defconfig An LTS change removed the need to set a config option. This broke the comparison validation with the output of "make savedefconfig". 
Change-Id: Id7ed6c6546d0efe88b67c0d1b92183152406e6f6 Signed-off-by: Alistair Strachan Signed-off-by: Amit Pundir --- arch/x86/configs/x86_64_cuttlefish_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index df9b6bd228f7..99d7b53932f7 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -449,7 +449,6 @@ CONFIG_HARDENED_USERCOPY=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set -CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_SHA512=y CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y From 8e1d8cd24dd0373dfdd47c5254ad08001dd1775d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 7 Jun 2018 13:39:49 -0700 Subject: [PATCH 1137/1212] UPSTREAM: socket: close race condition between sock_close() and sockfs_setattr() fchownat() doesn't even hold refcnt of fd until it figures out fd is really needed (otherwise is ignored) and releases it after it resolves the path. This means sock_close() could race with sockfs_setattr(), which leads to a NULL pointer dereference since typically we set sock->sk to NULL in ->release(). As pointed out by Al, this is unique to sockfs. So we can fix this in socket layer by acquiring inode_lock in sock_close() and checking against NULL in sockfs_setattr(). sock_release() is called in many places, only the sock_close() path matters here. And fortunately, this should not affect normal sock_close() as it is only called when the last fd refcnt is gone. It only affects sock_close() with a parallel sockfs_setattr() in progress, which is not common. Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Reported-by: shankarapailoor Cc: Tetsuo Handa Cc: Lorenzo Colitti Cc: Al Viro Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller (cherry picked from commit 6d8c50dcb029872b298eea68cc6209c866fd3e14) Signed-off-by: Chenbo Feng Bug: 112220999 Test: syzkaller reproducer doesn't trigger the crash anymore Change-Id: I90bec1515889e0dfd23f94e3f29b366c7bbfcd11 Signed-off-by: Amit Pundir --- net/socket.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/net/socket.c b/net/socket.c index b75a537807b5..b36c981d04dc 100644 --- a/net/socket.c +++ b/net/socket.c @@ -528,7 +528,10 @@ static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); - sock->sk->sk_uid = iattr->ia_uid; + if (sock->sk) + sock->sk->sk_uid = iattr->ia_uid; + else + err = -ENOENT; } return err; @@ -579,12 +582,16 @@ static struct socket *sock_alloc(void) * an inode not a file. */ -void sock_release(struct socket *sock) +static void __sock_release(struct socket *sock, struct inode *inode) { if (sock->ops) { struct module *owner = sock->ops->owner; + if (inode) + inode_lock(inode); sock->ops->release(sock); + if (inode) + inode_unlock(inode); sock->ops = NULL; module_put(owner); } @@ -599,6 +606,11 @@ void sock_release(struct socket *sock) } sock->file = NULL; } + +void sock_release(struct socket *sock) +{ + __sock_release(sock, NULL); +} EXPORT_SYMBOL(sock_release); void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) @@ -1035,7 +1047,7 @@ static int sock_mmap(struct file *file, struct vm_area_struct *vma) static int sock_close(struct inode *inode, struct file *filp) { - sock_release(SOCKET_I(inode)); + __sock_release(SOCKET_I(inode), inode); return 0; } From 711c8927f8a28c7aee352b50a290d9ff88c73a35 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 14 Jan 2016 15:22:32 -0800 Subject: [PATCH 1138/1212] UPSTREAM: zram: pass gfp from zcomp frontend to backend Each zcomp backend uses own gfp flag but it's pointless because the context they could be called is driven by upper 
layer(ie, zcomp frontend). As well, zcomp frontend could call them in different context. One context(ie, zram init part) is it should be better to make sure successful allocation other context(ie, further stream allocation part for accelerating I/O speed) is just optional so let's pass gfp down from driver (ie, zcomp frontend) like normal MM convention. [sergey.senozhatsky@gmail.com: add missing __vmalloc zero and highmem gfps] Signed-off-by: Minchan Kim Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 75d8947a36d0c9aedd69118d1f14bf424005c7c2) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I572d0565de5aff94ebe0782eba9d34f9c9862060 Signed-off-by: Amit Pundir --- drivers/block/zram/zcomp.c | 24 ++++++++++++++++-------- drivers/block/zram/zcomp.h | 2 +- drivers/block/zram/zcomp_lz4.c | 16 +++------------- drivers/block/zram/zcomp_lzo.c | 16 +++------------- 4 files changed, 23 insertions(+), 35 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index c53617752b93..3ef42e563bb5 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -74,18 +74,18 @@ static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) * allocate new zcomp_strm structure with ->private initialized by * backend, return NULL on error */ -static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_NOIO); + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); if (!zstrm) return NULL; - zstrm->private = comp->backend->create(); + zstrm->private = comp->backend->create(flags); /* * allocate 2 pages. 
1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(GFP_NOIO | __GFP_ZERO, 1); + zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); if (!zstrm->private || !zstrm->buffer) { zcomp_strm_free(comp, zstrm); zstrm = NULL; @@ -120,8 +120,16 @@ static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) /* allocate new zstrm stream */ zs->avail_strm++; spin_unlock(&zs->strm_lock); - - zstrm = zcomp_strm_alloc(comp); + /* + * This function can be called in swapout/fs write path + * so we can't use GFP_FS|IO. And it assumes we already + * have at least one stream in zram initialization so we + * don't do best effort to allocate more stream in here. + * A default stream will work well without further multiple + * streams. That's why we use NORETRY | NOWARN. + */ + zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY | + __GFP_NOWARN); if (!zstrm) { spin_lock(&zs->strm_lock); zs->avail_strm--; @@ -209,7 +217,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) zs->max_strm = max_strm; zs->avail_strm = 1; - zstrm = zcomp_strm_alloc(comp); + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); if (!zstrm) { kfree(zs); return -ENOMEM; @@ -259,7 +267,7 @@ static int zcomp_strm_single_create(struct zcomp *comp) comp->stream = zs; mutex_init(&zs->strm_lock); - zs->zstrm = zcomp_strm_alloc(comp); + zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); if (!zs->zstrm) { kfree(zs); return -ENOMEM; diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 46e2b9f8f1f0..b7d2a4bcae54 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -33,7 +33,7 @@ struct zcomp_backend { int (*decompress)(const unsigned char *src, size_t src_len, unsigned char *dst); - void *(*create)(void); + void *(*create)(gfp_t flags); void (*destroy)(void *private); const char *name; diff --git a/drivers/block/zram/zcomp_lz4.c 
b/drivers/block/zram/zcomp_lz4.c index dd6083124276..dc2338d5258c 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -15,24 +15,14 @@ #include "zcomp_lz4.h" -static void *zcomp_lz4_create(void) +static void *zcomp_lz4_create(gfp_t flags) { void *ret; - /* - * This function can be called in swapout/fs write path - * so we can't use GFP_FS|IO. And it assumes we already - * have at least one stream in zram initialization so we - * don't do best effort to allocate more stream in here. - * A default stream will work well without further multiple - * streams. That's why we use NORETRY | NOWARN. - */ - ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | - __GFP_NOWARN); + ret = kzalloc(LZ4_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZ4_MEM_COMPRESS, - GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | - __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_ZERO | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index edc549920fa0..0ab6fce8abe4 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -15,24 +15,14 @@ #include "zcomp_lzo.h" -static void *lzo_create(void) +static void *lzo_create(gfp_t flags) { void *ret; - /* - * This function can be called in swapout/fs write path - * so we can't use GFP_FS|IO. And it assumes we already - * have at least one stream in zram initialization so we - * don't do best effort to allocate more stream in here. - * A default stream will work well without further multiple - * streams. That's why we use NORETRY | NOWARN. 
- */ - ret = kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | - __GFP_NOWARN); + ret = kzalloc(LZO1X_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZO1X_MEM_COMPRESS, - GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | - __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_ZERO | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } From 2fafbdf79ebe83ab3121482f910e43a1914979c2 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 14 Jan 2016 15:22:35 -0800 Subject: [PATCH 1139/1212] UPSTREAM: zram/zcomp: do not zero out zcomp private pages Do not __GFP_ZERO allocated zcomp ->private pages. We keep allocated streams around and use them for read/write requests, so we supply a zeroed out ->private to compression algorithm as a scratch buffer only once -- the first time we use that stream. For the rest of IO requests served by this stream ->private usually contains some temporarily data from the previous requests. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e02d238c9852a91b30da9ea32ce36d1416cdc683) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I911832da703f596998a4139d6033ef1564848c9e Signed-off-by: Amit Pundir --- drivers/block/zram/zcomp_lz4.c | 4 ++-- drivers/block/zram/zcomp_lzo.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index dc2338d5258c..0110086accba 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -19,10 +19,10 @@ static void *zcomp_lz4_create(gfp_t flags) { void *ret; - ret = kzalloc(LZ4_MEM_COMPRESS, flags); + ret = kmalloc(LZ4_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZ4_MEM_COMPRESS, - flags | __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index 0ab6fce8abe4..ed7a1f0549ec 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ 
b/drivers/block/zram/zcomp_lzo.c @@ -19,10 +19,10 @@ static void *lzo_create(gfp_t flags) { void *ret; - ret = kzalloc(LZO1X_MEM_COMPRESS, flags); + ret = kmalloc(LZO1X_MEM_COMPRESS, flags); if (!ret) ret = __vmalloc(LZO1X_MEM_COMPRESS, - flags | __GFP_ZERO | __GFP_HIGHMEM, + flags | __GFP_HIGHMEM, PAGE_KERNEL); return ret; } From 055114890dc0559466982e1decd8b818a559526b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 16:59:48 -0700 Subject: [PATCH 1140/1212] BACKPORT: zsmalloc: require GFP in zs_malloc() Pass GFP flags to zs_malloc() instead of using a fixed mask supplied to zs_create_pool(), so we can be more flexible, but, more importantly, we need this to switch zram to per-cpu compression streams -- zram will try to allocate handle with preemption disabled in a fast path and switch to a slow path (using different gfp mask) if the fast one has failed. Apart from that, this also align zs_malloc() interface with zspool/zbud. [sergey.senozhatsky@gmail.com: pass GFP flags to zs_malloc() instead of using a fixed mask] Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d0d8da2dc49dfdfe1d788eaf4d55eb5d4964d926) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I31276c9351be21a4ed588681b332e98142b76526 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 4 ++-- include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 24 +++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 502406c9e6e1..89245f1e1f83 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -514,7 +514,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) goto out_error; } - meta->mem_pool = 
zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + meta->mem_pool = zs_create_pool(pool_name); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen); + handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM); if (!handle) { pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 34eb16098a33..57a8e98f2708 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -41,10 +41,10 @@ struct zs_pool_stats { struct zs_pool; -struct zs_pool *zs_create_pool(const char *name, gfp_t flags); +struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); void *zs_map_object(struct zs_pool *pool, unsigned long handle, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c1ea19478119..8fced2101492 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -247,7 +247,6 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; - gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; struct zs_pool_stats stats; @@ -296,10 +295,10 @@ static void destroy_handle_cache(struct zs_pool *pool) kmem_cache_destroy(pool->handle_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool) +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - pool->flags & ~__GFP_HIGHMEM); + gfp & ~__GFP_HIGHMEM); } static void free_handle(struct zs_pool *pool, unsigned long handle) @@ -325,7 +324,12 @@ static void *zs_zpool_create(const char *name, gfp_t 
gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { - return zs_create_pool(name, gfp); + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -336,7 +340,7 @@ static void zs_zpool_destroy(void *pool) static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { - *handle = zs_malloc(pool, size); + *handle = zs_malloc(pool, size, gfp); return *handle ? 0 : -1; } static void zs_zpool_free(void *pool, unsigned long handle) @@ -1388,7 +1392,7 @@ static unsigned long obj_malloc(struct page *first_page, * otherwise 0. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; @@ -1397,7 +1401,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool); + handle = alloc_handle(pool, gfp); if (!handle) return 0; @@ -1410,7 +1414,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); + first_page = alloc_zspage(class, gfp); if (unlikely(!first_page)) { free_handle(pool, handle); return 0; @@ -1884,7 +1888,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. 
*/ -struct zs_pool *zs_create_pool(const char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; @@ -1954,8 +1958,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) prev_class = class; } - pool->flags = flags; - if (zs_pool_stat_create(name, pool)) goto err; From c331c792c0caf08c74b259376286a1bba0208295 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 16:59:51 -0700 Subject: [PATCH 1141/1212] UPSTREAM: zram: user per-cpu compression streams Remove idle streams list and keep compression streams in per-cpu data. This removes two contended spin_lock()/spin_unlock() calls from write path and also prevents write OP from being preempted while holding the compression stream, which can cause slow downs. For instance, let's assume that we have N cpus and N-2 max_comp_streams. TASK1 owns the last idle stream, TASK2-TASK3 come in with the write requests: TASK1 TASK2 TASK3 zram_bvec_write() spin_lock find stream spin_unlock compress <> zram_bvec_write() spin_lock find stream spin_unlock no_stream schedule zram_bvec_write() spin_lock find_stream spin_unlock no_stream schedule spin_lock release stream spin_unlock wake up TASK2 not only TASK2 and TASK3 will not get the stream, TASK1 will be preempted in the middle of its operation; while we would prefer it to finish compression and release the stream. Test environment: x86_64, 4 CPU box, 3G zram, lzo The following fio tests were executed: read, randread, write, randwrite, rw, randrw with the increasing number of jobs from 1 to 10. 
4 streams 8 streams per-cpu =========================================================== jobs1 READ: 2520.1MB/s 2566.5MB/s 2491.5MB/s READ: 2102.7MB/s 2104.2MB/s 2091.3MB/s WRITE: 1355.1MB/s 1320.2MB/s 1378.9MB/s WRITE: 1103.5MB/s 1097.2MB/s 1122.5MB/s READ: 434013KB/s 435153KB/s 439961KB/s WRITE: 433969KB/s 435109KB/s 439917KB/s READ: 403166KB/s 405139KB/s 403373KB/s WRITE: 403223KB/s 405197KB/s 403430KB/s jobs2 READ: 7958.6MB/s 8105.6MB/s 8073.7MB/s READ: 6864.9MB/s 6989.8MB/s 7021.8MB/s WRITE: 2438.1MB/s 2346.9MB/s 3400.2MB/s WRITE: 1994.2MB/s 1990.3MB/s 2941.2MB/s READ: 981504KB/s 973906KB/s 1018.8MB/s WRITE: 981659KB/s 974060KB/s 1018.1MB/s READ: 937021KB/s 938976KB/s 987250KB/s WRITE: 934878KB/s 936830KB/s 984993KB/s jobs3 READ: 13280MB/s 13553MB/s 13553MB/s READ: 11534MB/s 11785MB/s 11755MB/s WRITE: 3456.9MB/s 3469.9MB/s 4810.3MB/s WRITE: 3029.6MB/s 3031.6MB/s 4264.8MB/s READ: 1363.8MB/s 1362.6MB/s 1448.9MB/s WRITE: 1361.9MB/s 1360.7MB/s 1446.9MB/s READ: 1309.4MB/s 1310.6MB/s 1397.5MB/s WRITE: 1307.4MB/s 1308.5MB/s 1395.3MB/s jobs4 READ: 20244MB/s 20177MB/s 20344MB/s READ: 17886MB/s 17913MB/s 17835MB/s WRITE: 4071.6MB/s 4046.1MB/s 6370.2MB/s WRITE: 3608.9MB/s 3576.3MB/s 5785.4MB/s READ: 1824.3MB/s 1821.6MB/s 1997.5MB/s WRITE: 1819.8MB/s 1817.4MB/s 1992.5MB/s READ: 1765.7MB/s 1768.3MB/s 1937.3MB/s WRITE: 1767.5MB/s 1769.1MB/s 1939.2MB/s jobs5 READ: 18663MB/s 18986MB/s 18823MB/s READ: 16659MB/s 16605MB/s 16954MB/s WRITE: 3912.4MB/s 3888.7MB/s 6126.9MB/s WRITE: 3506.4MB/s 3442.5MB/s 5519.3MB/s READ: 1798.2MB/s 1746.5MB/s 1935.8MB/s WRITE: 1792.7MB/s 1740.7MB/s 1929.1MB/s READ: 1727.6MB/s 1658.2MB/s 1917.3MB/s WRITE: 1726.5MB/s 1657.2MB/s 1916.6MB/s jobs6 READ: 21017MB/s 20922MB/s 21162MB/s READ: 19022MB/s 19140MB/s 18770MB/s WRITE: 3968.2MB/s 4037.7MB/s 6620.8MB/s WRITE: 3643.5MB/s 3590.2MB/s 6027.5MB/s READ: 1871.8MB/s 1880.5MB/s 2049.9MB/s WRITE: 1867.8MB/s 1877.2MB/s 2046.2MB/s READ: 1755.8MB/s 1710.3MB/s 1964.7MB/s WRITE: 1750.5MB/s 1705.9MB/s 1958.8MB/s 
jobs7 READ: 21103MB/s 20677MB/s 21482MB/s READ: 18522MB/s 18379MB/s 19443MB/s WRITE: 4022.5MB/s 4067.4MB/s 6755.9MB/s WRITE: 3691.7MB/s 3695.5MB/s 5925.6MB/s READ: 1841.5MB/s 1933.9MB/s 2090.5MB/s WRITE: 1842.7MB/s 1935.3MB/s 2091.9MB/s READ: 1832.4MB/s 1856.4MB/s 1971.5MB/s WRITE: 1822.3MB/s 1846.2MB/s 1960.6MB/s jobs8 READ: 20463MB/s 20194MB/s 20862MB/s READ: 18178MB/s 17978MB/s 18299MB/s WRITE: 4085.9MB/s 4060.2MB/s 7023.8MB/s WRITE: 3776.3MB/s 3737.9MB/s 6278.2MB/s READ: 1957.6MB/s 1944.4MB/s 2109.5MB/s WRITE: 1959.2MB/s 1946.2MB/s 2111.4MB/s READ: 1900.6MB/s 1885.7MB/s 2082.1MB/s WRITE: 1896.2MB/s 1881.4MB/s 2078.3MB/s jobs9 READ: 19692MB/s 19734MB/s 19334MB/s READ: 17678MB/s 18249MB/s 17666MB/s WRITE: 4004.7MB/s 4064.8MB/s 6990.7MB/s WRITE: 3724.7MB/s 3772.1MB/s 6193.6MB/s READ: 1953.7MB/s 1967.3MB/s 2105.6MB/s WRITE: 1953.4MB/s 1966.7MB/s 2104.1MB/s READ: 1860.4MB/s 1897.4MB/s 2068.5MB/s WRITE: 1858.9MB/s 1895.9MB/s 2066.8MB/s jobs10 READ: 19730MB/s 19579MB/s 19492MB/s READ: 18028MB/s 18018MB/s 18221MB/s WRITE: 4027.3MB/s 4090.6MB/s 7020.1MB/s WRITE: 3810.5MB/s 3846.8MB/s 6426.8MB/s READ: 1956.1MB/s 1994.6MB/s 2145.2MB/s WRITE: 1955.9MB/s 1993.5MB/s 2144.8MB/s READ: 1852.8MB/s 1911.6MB/s 2075.8MB/s WRITE: 1855.7MB/s 1914.6MB/s 2078.1MB/s perf stat 4 streams 8 streams per-cpu ==================================================================================================================== jobs1 stalled-cycles-frontend 23,174,811,209 ( 38.21%) 23,220,254,188 ( 38.25%) 23,061,406,918 ( 38.34%) stalled-cycles-backend 11,514,174,638 ( 18.98%) 11,696,722,657 ( 19.27%) 11,370,852,810 ( 18.90%) instructions 73,925,005,782 ( 1.22) 73,903,177,632 ( 1.22) 73,507,201,037 ( 1.22) branches 14,455,124,835 ( 756.063) 14,455,184,779 ( 755.281) 14,378,599,509 ( 758.546) branch-misses 69,801,336 ( 0.48%) 80,225,529 ( 0.55%) 72,044,726 ( 0.50%) jobs2 stalled-cycles-frontend 49,912,741,782 ( 46.11%) 50,101,189,290 ( 45.95%) 32,874,195,633 ( 35.11%) stalled-cycles-backend 
27,080,366,230 ( 25.02%) 27,949,970,232 ( 25.63%) 16,461,222,706 ( 17.58%) instructions 122,831,629,690 ( 1.13) 122,919,846,419 ( 1.13) 121,924,786,775 ( 1.30) branches 23,725,889,239 ( 692.663) 23,733,547,140 ( 688.062) 23,553,950,311 ( 794.794) branch-misses 90,733,041 ( 0.38%) 96,320,895 ( 0.41%) 84,561,092 ( 0.36%) jobs3 stalled-cycles-frontend 66,437,834,608 ( 45.58%) 63,534,923,344 ( 43.69%) 42,101,478,505 ( 33.19%) stalled-cycles-backend 34,940,799,661 ( 23.97%) 34,774,043,148 ( 23.91%) 21,163,324,388 ( 16.68%) instructions 171,692,121,862 ( 1.18) 171,775,373,044 ( 1.18) 170,353,542,261 ( 1.34) branches 32,968,962,622 ( 628.723) 32,987,739,894 ( 630.512) 32,729,463,918 ( 717.027) branch-misses 111,522,732 ( 0.34%) 110,472,894 ( 0.33%) 99,791,291 ( 0.30%) jobs4 stalled-cycles-frontend 98,741,701,675 ( 49.72%) 94,797,349,965 ( 47.59%) 54,535,655,381 ( 33.53%) stalled-cycles-backend 54,642,609,615 ( 27.51%) 55,233,554,408 ( 27.73%) 27,882,323,541 ( 17.14%) instructions 220,884,807,851 ( 1.11) 220,930,887,273 ( 1.11) 218,926,845,851 ( 1.35) branches 42,354,518,180 ( 592.105) 42,362,770,587 ( 590.452) 41,955,552,870 ( 716.154) branch-misses 138,093,449 ( 0.33%) 131,295,286 ( 0.31%) 121,794,771 ( 0.29%) jobs5 stalled-cycles-frontend 116,219,747,212 ( 48.14%) 110,310,397,012 ( 46.29%) 66,373,082,723 ( 33.70%) stalled-cycles-backend 66,325,434,776 ( 27.48%) 64,157,087,914 ( 26.92%) 32,999,097,299 ( 16.76%) instructions 270,615,008,466 ( 1.12) 270,546,409,525 ( 1.14) 268,439,910,948 ( 1.36) branches 51,834,046,557 ( 599.108) 51,811,867,722 ( 608.883) 51,412,576,077 ( 729.213) branch-misses 158,197,086 ( 0.31%) 142,639,805 ( 0.28%) 133,425,455 ( 0.26%) jobs6 stalled-cycles-frontend 138,009,414,492 ( 48.23%) 139,063,571,254 ( 48.80%) 75,278,568,278 ( 32.80%) stalled-cycles-backend 79,211,949,650 ( 27.68%) 79,077,241,028 ( 27.75%) 37,735,797,899 ( 16.44%) instructions 319,763,993,731 ( 1.12) 319,937,782,834 ( 1.12) 316,663,600,784 ( 1.38) branches 61,219,433,294 ( 
595.056) 61,250,355,540 ( 598.215) 60,523,446,617 ( 733.706) branch-misses 169,257,123 ( 0.28%) 154,898,028 ( 0.25%) 141,180,587 ( 0.23%) jobs7 stalled-cycles-frontend 162,974,812,119 ( 49.20%) 159,290,061,987 ( 48.43%) 88,046,641,169 ( 33.21%) stalled-cycles-backend 92,223,151,661 ( 27.84%) 91,667,904,406 ( 27.87%) 44,068,454,971 ( 16.62%) instructions 369,516,432,430 ( 1.12) 369,361,799,063 ( 1.12) 365,290,380,661 ( 1.38) branches 70,795,673,950 ( 594.220) 70,743,136,124 ( 597.876) 69,803,996,038 ( 732.822) branch-misses 181,708,327 ( 0.26%) 165,767,821 ( 0.23%) 150,109,797 ( 0.22%) jobs8 stalled-cycles-frontend 185,000,017,027 ( 49.30%) 182,334,345,473 ( 48.37%) 99,980,147,041 ( 33.26%) stalled-cycles-backend 105,753,516,186 ( 28.18%) 107,937,830,322 ( 28.63%) 51,404,177,181 ( 17.10%) instructions 418,153,161,055 ( 1.11) 418,308,565,828 ( 1.11) 413,653,475,581 ( 1.38) branches 80,035,882,398 ( 592.296) 80,063,204,510 ( 589.843) 79,024,105,589 ( 730.530) branch-misses 199,764,528 ( 0.25%) 177,936,926 ( 0.22%) 160,525,449 ( 0.20%) jobs9 stalled-cycles-frontend 210,941,799,094 ( 49.63%) 204,714,679,254 ( 48.55%) 114,251,113,756 ( 33.96%) stalled-cycles-backend 122,640,849,067 ( 28.85%) 122,188,553,256 ( 28.98%) 58,360,041,127 ( 17.35%) instructions 468,151,025,415 ( 1.10) 467,354,869,323 ( 1.11) 462,665,165,216 ( 1.38) branches 89,657,067,510 ( 585.628) 89,411,550,407 ( 588.990) 88,360,523,943 ( 730.151) branch-misses 218,292,301 ( 0.24%) 191,701,247 ( 0.21%) 178,535,678 ( 0.20%) jobs10 stalled-cycles-frontend 233,595,958,008 ( 49.81%) 227,540,615,689 ( 49.11%) 160,341,979,938 ( 43.07%) stalled-cycles-backend 136,153,676,021 ( 29.03%) 133,635,240,742 ( 28.84%) 65,909,135,465 ( 17.70%) instructions 517,001,168,497 ( 1.10) 516,210,976,158 ( 1.11) 511,374,038,613 ( 1.37) branches 98,911,641,329 ( 585.796) 98,700,069,712 ( 591.583) 97,646,761,028 ( 728.712) branch-misses 232,341,823 ( 0.23%) 199,256,308 ( 0.20%) 183,135,268 ( 0.19%) per-cpu streams tend to cause 
significantly fewer stalled cycles; execute fewer branches and hit fewer branch-misses. perf stat reported execution time 4 streams 8 streams per-cpu ==================================================================== jobs1 seconds elapsed 20.909073870 20.875670495 20.817838540 jobs2 seconds elapsed 18.529488399 18.720566469 16.356103108 jobs3 seconds elapsed 18.991159531 18.991340812 16.766216066 jobs4 seconds elapsed 19.560643828 19.551323547 16.246621715 jobs5 seconds elapsed 24.746498464 25.221646740 20.696112444 jobs6 seconds elapsed 28.258181828 28.289765505 22.885688857 jobs7 seconds elapsed 32.632490241 31.909125381 26.272753738 jobs8 seconds elapsed 35.651403851 36.027596308 29.108024711 jobs9 seconds elapsed 40.569362365 40.024227989 32.898204012 jobs10 seconds elapsed 44.673112304 43.874898137 35.632952191 Please see Link: http://marc.info/?l=linux-kernel&m=146166970727530 Link: http://marc.info/?l=linux-kernel&m=146174716719650 for more test results (under low memory conditions).
Signed-off-by: Sergey Senozhatsky Suggested-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit da9556a2367cf2261ab4d3e100693c82fb1ddb26) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I1af1a466f0ac3f74f9c36f06685111ccef0f4ec4 Signed-off-by: Amit Pundir --- drivers/block/zram/zcomp.c | 297 ++++++++++------------------------ drivers/block/zram/zcomp.h | 14 +- drivers/block/zram/zram_drv.c | 36 ++++- 3 files changed, 116 insertions(+), 231 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 3ef42e563bb5..bc98d5ed5477 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "zcomp.h" #include "zcomp_lzo.h" @@ -20,29 +21,6 @@ #include "zcomp_lz4.h" #endif -/* - * single zcomp_strm backend - */ -struct zcomp_strm_single { - struct mutex strm_lock; - struct zcomp_strm *zstrm; -}; - -/* - * multi zcomp_strm backend - */ -struct zcomp_strm_multi { - /* protect strm list */ - spinlock_t strm_lock; - /* max possible number of zstrm streams */ - int max_strm; - /* number of available zstrm streams */ - int avail_strm; - /* list of available strms */ - struct list_head idle_strm; - wait_queue_head_t strm_wait; -}; - static struct zcomp_backend *backends[] = { &zcomp_lzo, #ifdef CONFIG_ZRAM_LZ4_COMPRESS @@ -93,188 +71,6 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) return zstrm; } -/* - * get idle zcomp_strm or wait until other process release - * (zcomp_strm_release()) one for us - */ -static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - while (1) { - spin_lock(&zs->strm_lock); - if (!list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - spin_unlock(&zs->strm_lock); - return zstrm; - } - /* zstrm streams 
limit reached, wait for idle stream */ - if (zs->avail_strm >= zs->max_strm) { - spin_unlock(&zs->strm_lock); - wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); - continue; - } - /* allocate new zstrm stream */ - zs->avail_strm++; - spin_unlock(&zs->strm_lock); - /* - * This function can be called in swapout/fs write path - * so we can't use GFP_FS|IO. And it assumes we already - * have at least one stream in zram initialization so we - * don't do best effort to allocate more stream in here. - * A default stream will work well without further multiple - * streams. That's why we use NORETRY | NOWARN. - */ - zstrm = zcomp_strm_alloc(comp, GFP_NOIO | __GFP_NORETRY | - __GFP_NOWARN); - if (!zstrm) { - spin_lock(&zs->strm_lock); - zs->avail_strm--; - spin_unlock(&zs->strm_lock); - wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); - continue; - } - break; - } - return zstrm; -} - -/* add stream back to idle list and wake up waiter or free the stream */ -static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) -{ - struct zcomp_strm_multi *zs = comp->stream; - - spin_lock(&zs->strm_lock); - if (zs->avail_strm <= zs->max_strm) { - list_add(&zstrm->list, &zs->idle_strm); - spin_unlock(&zs->strm_lock); - wake_up(&zs->strm_wait); - return; - } - - zs->avail_strm--; - spin_unlock(&zs->strm_lock); - zcomp_strm_free(comp, zstrm); -} - -/* change max_strm limit */ -static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - spin_lock(&zs->strm_lock); - zs->max_strm = num_strm; - /* - * if user has lowered the limit and there are idle streams, - * immediately free as much streams (and memory) as we can. 
- */ - while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - zcomp_strm_free(comp, zstrm); - zs->avail_strm--; - } - spin_unlock(&zs->strm_lock); - return true; -} - -static void zcomp_strm_multi_destroy(struct zcomp *comp) -{ - struct zcomp_strm_multi *zs = comp->stream; - struct zcomp_strm *zstrm; - - while (!list_empty(&zs->idle_strm)) { - zstrm = list_entry(zs->idle_strm.next, - struct zcomp_strm, list); - list_del(&zstrm->list); - zcomp_strm_free(comp, zstrm); - } - kfree(zs); -} - -static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) -{ - struct zcomp_strm *zstrm; - struct zcomp_strm_multi *zs; - - comp->destroy = zcomp_strm_multi_destroy; - comp->strm_find = zcomp_strm_multi_find; - comp->strm_release = zcomp_strm_multi_release; - comp->set_max_streams = zcomp_strm_multi_set_max_streams; - zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); - if (!zs) - return -ENOMEM; - - comp->stream = zs; - spin_lock_init(&zs->strm_lock); - INIT_LIST_HEAD(&zs->idle_strm); - init_waitqueue_head(&zs->strm_wait); - zs->max_strm = max_strm; - zs->avail_strm = 1; - - zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); - if (!zstrm) { - kfree(zs); - return -ENOMEM; - } - list_add(&zstrm->list, &zs->idle_strm); - return 0; -} - -static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) -{ - struct zcomp_strm_single *zs = comp->stream; - mutex_lock(&zs->strm_lock); - return zs->zstrm; -} - -static void zcomp_strm_single_release(struct zcomp *comp, - struct zcomp_strm *zstrm) -{ - struct zcomp_strm_single *zs = comp->stream; - mutex_unlock(&zs->strm_lock); -} - -static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) -{ - /* zcomp_strm_single support only max_comp_streams == 1 */ - return false; -} - -static void zcomp_strm_single_destroy(struct zcomp *comp) -{ - struct zcomp_strm_single *zs = comp->stream; - 
zcomp_strm_free(comp, zs->zstrm); - kfree(zs); -} - -static int zcomp_strm_single_create(struct zcomp *comp) -{ - struct zcomp_strm_single *zs; - - comp->destroy = zcomp_strm_single_destroy; - comp->strm_find = zcomp_strm_single_find; - comp->strm_release = zcomp_strm_single_release; - comp->set_max_streams = zcomp_strm_single_set_max_streams; - zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); - if (!zs) - return -ENOMEM; - - comp->stream = zs; - mutex_init(&zs->strm_lock); - zs->zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); - if (!zs->zstrm) { - kfree(zs); - return -ENOMEM; - } - return 0; -} - /* show available compressors */ ssize_t zcomp_available_show(const char *comp, char *buf) { @@ -301,17 +97,17 @@ bool zcomp_available_algorithm(const char *comp) bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { - return comp->set_max_streams(comp, num_strm); + return true; } struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { - return comp->strm_find(comp); + return *get_cpu_ptr(comp->stream); } void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) { - comp->strm_release(comp, zstrm); + put_cpu_ptr(comp->stream); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -327,9 +123,83 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src, return comp->backend->decompress(src, src_len, dst); } +static int __zcomp_cpu_notifier(struct zcomp *comp, + unsigned long action, unsigned long cpu) +{ + struct zcomp_strm *zstrm; + + switch (action) { + case CPU_UP_PREPARE: + if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) + break; + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + if (IS_ERR_OR_NULL(zstrm)) { + pr_err("Can't allocate a compression stream\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(comp->stream, cpu) = zstrm; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + zstrm = *per_cpu_ptr(comp->stream, cpu); + if (!IS_ERR_OR_NULL(zstrm)) + zcomp_strm_free(comp, zstrm); + *per_cpu_ptr(comp->stream, cpu) = NULL; 
+ break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcomp_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + struct zcomp *comp = container_of(nb, typeof(*comp), notifier); + + return __zcomp_cpu_notifier(comp, action, cpu); +} + +static int zcomp_init(struct zcomp *comp) +{ + unsigned long cpu; + int ret; + + comp->notifier.notifier_call = zcomp_cpu_notifier; + + comp->stream = alloc_percpu(struct zcomp_strm *); + if (!comp->stream) + return -ENOMEM; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) { + ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu); + if (ret == NOTIFY_BAD) + goto cleanup; + } + __register_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + cpu_notifier_register_done(); + return -ENOMEM; +} + void zcomp_destroy(struct zcomp *comp) { - comp->destroy(comp); + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + __unregister_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + + free_percpu(comp->stream); kfree(comp); } @@ -339,9 +209,9 @@ void zcomp_destroy(struct zcomp *comp) * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in * case of allocation error, or any other error potentially - * returned by functions zcomp_strm_{multi,single}_create. + * returned by zcomp_init(). 
*/ -struct zcomp *zcomp_create(const char *compress, int max_strm) +struct zcomp *zcomp_create(const char *compress) { struct zcomp *comp; struct zcomp_backend *backend; @@ -356,10 +226,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) return ERR_PTR(-ENOMEM); comp->backend = backend; - if (max_strm > 1) - error = zcomp_strm_multi_create(comp, max_strm); - else - error = zcomp_strm_single_create(comp); + error = zcomp_init(comp); if (error) { kfree(comp); return ERR_PTR(error); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index b7d2a4bcae54..ffd88cb747fe 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -10,8 +10,6 @@ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ -#include - struct zcomp_strm { /* compression/decompression buffer */ void *buffer; @@ -21,8 +19,6 @@ struct zcomp_strm { * working memory) */ void *private; - /* used in multi stream backend, protected by backend strm_lock */ - struct list_head list; }; /* static compression backend */ @@ -41,19 +37,15 @@ struct zcomp_backend { /* dynamic per-device compression frontend */ struct zcomp { - void *stream; + struct zcomp_strm * __percpu *stream; struct zcomp_backend *backend; - - struct zcomp_strm *(*strm_find)(struct zcomp *comp); - void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - bool (*set_max_streams)(struct zcomp *comp, int num_strm); - void (*destroy)(struct zcomp *comp); + struct notifier_block notifier; }; ssize_t zcomp_available_show(const char *comp, char *buf); bool zcomp_available_algorithm(const char *comp); -struct zcomp *zcomp_create(const char *comp, int max_strm); +struct zcomp *zcomp_create(const char *comp); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 89245f1e1f83..e174f139dbc9 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -650,7 +650,7 @@ 
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, { int ret = 0; size_t clen; - unsigned long handle; + unsigned long handle = 0; struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; @@ -673,9 +673,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - zstrm = zcomp_strm_find(zram->comp); +compress_again: user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) { memcpy(uncmem + offset, user_mem + bvec->bv_offset, bvec->bv_len); @@ -699,6 +698,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } + zstrm = zcomp_strm_find(zram->comp); ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); @@ -710,6 +710,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, pr_err("Compression failed! err=%d\n", ret); goto out; } + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { clen = PAGE_SIZE; @@ -717,8 +718,33 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM); + /* + * handle allocation has 2 paths: + * a) fast path is executed with preemption disabled (for + * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, + * since we can't sleep; + * b) slow path enables preemption and attempts to allocate + * the page with __GFP_DIRECT_RECLAIM bit set. we have to + * put per-cpu compression stream and, thus, to re-do + * the compression once handle is allocated. + * + * if we have a 'non-null' handle here then we are coming + * from the slow path and handle has already been allocated. 
+ */ + if (!handle) + handle = zs_malloc(meta->mem_pool, clen, + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM); if (!handle) { + zcomp_strm_release(zram->comp, zstrm); + zstrm = NULL; + + handle = zs_malloc(meta->mem_pool, clen, + GFP_NOIO | __GFP_HIGHMEM); + if (handle) + goto compress_again; + pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); ret = -ENOMEM; @@ -1038,7 +1064,7 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; - comp = zcomp_create(zram->compressor, zram->max_comp_streams); + comp = zcomp_create(zram->compressor); if (IS_ERR(comp)) { pr_err("Cannot initialise %s compressing backend\n", zram->compressor); From 6cbf23139066c396b7856c37ceeb164135f55e77 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 16:59:59 -0700 Subject: [PATCH 1142/1212] UPSTREAM: zram: remove max_comp_streams internals Remove the internal part of max_comp_streams interface, since we switched to per-cpu streams. We will keep RW max_comp_streams attr around, because: a) we may (silently) switch back to idle compression streams list and don't want to disturb user space b) max_comp_streams attr must wait for the next 'lay off cycle'; we give user space 2 years to adjust before we remove/downgrade the attr, and there are already several attrs scheduled for removal in 4.11, so it's too late for max_comp_streams. This slightly changes user-visible behaviour: - First, reading from the max_comp_streams file will now always return the number of online CPUs. - Second, writing to max_comp_streams will not have any effect.
Link: http://lkml.kernel.org/r/20160503165546.25201-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 43209ea2d17aae1540d4e28274e36404f72702f2) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I1902e741b4d3b83c5bd0d66bf1bae021dbfe2056 Signed-off-by: Amit Pundir --- Documentation/blockdev/zram.txt | 25 +++++------------- drivers/block/zram/zcomp.c | 5 ---- drivers/block/zram/zram_drv.c | 45 ++++++++------------------------- drivers/block/zram/zram_drv.h | 1 - 4 files changed, 18 insertions(+), 58 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 5bda5031c83d..d88f0c70cd7f 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -59,27 +59,16 @@ num_devices parameter is optional and tells zram how many devices should be pre-created. Default: 1. 2) Set max number of compression streams - Compression backend may use up to max_comp_streams compression streams, - thus allowing up to max_comp_streams concurrent compression operations. - By default, compression backend uses single compression stream. + Regardless the value passed to this attribute, ZRAM will always + allocate multiple compression streams - one per online CPUs - thus + allowing several concurrent compression operations. The number of + allocated compression streams goes down when some of the CPUs + become offline. There is no single-compression-stream mode anymore, + unless you are running a UP system or has only 1 CPU online. 
- Examples: - #show max compression streams number + To find out how many streams are currently available: cat /sys/block/zram0/max_comp_streams - #set max compression streams number to 3 - echo 3 > /sys/block/zram0/max_comp_streams - -Note: -In order to enable compression backend's multi stream support max_comp_streams -must be initially set to desired concurrency level before ZRAM device -initialisation. Once the device initialised as a single stream compression -backend (max_comp_streams equals to 1), you will see error if you try to change -the value of max_comp_streams because single stream compression backend -implemented as a special case by lock overhead issue and does not support -dynamic max_comp_streams. Only multi stream backend supports dynamic -max_comp_streams adjustment. - 3) Select compression algorithm Using comp_algorithm device attribute one can see available and currently selected (shown in square brackets) compression algorithms, diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index bc98d5ed5477..b51a816d766b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -95,11 +95,6 @@ bool zcomp_available_algorithm(const char *comp) return find_backend(comp) != NULL; } -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) -{ - return true; -} - struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { return *get_cpu_ptr(comp->stream); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e174f139dbc9..e38f4419cc4e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +/* + * We switched to per-cpu streams and this attr is not needed anymore. 
+ * However, we will keep it around for some time, because: + * a) we may revert per-cpu streams in the future + * b) it's visible to user space and we need to follow our 2 years + * retirement rule; but we already have a number of 'soon to be + * altered' attrs, so max_comp_streams need to wait for the next + * layoff cycle. + */ static ssize_t max_comp_streams_show(struct device *dev, struct device_attribute *attr, char *buf) { - int val; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - val = zram->max_comp_streams; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus()); } static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - int num; - struct zram *zram = dev_to_zram(dev); - int ret; - - ret = kstrtoint(buf, 0, &num); - if (ret < 0) - return ret; - if (num < 1) - return -EINVAL; - - down_write(&zram->init_lock); - if (init_done(zram)) { - if (!zcomp_set_max_streams(zram->comp, num)) { - pr_info("Cannot change max compression streams\n"); - ret = -EINVAL; - goto out; - } - } - - zram->max_comp_streams = num; - ret = len; -out: - up_write(&zram->init_lock); - return ret; + return len; } static ssize_t comp_algorithm_show(struct device *dev, @@ -1035,7 +1014,6 @@ static void zram_reset_device(struct zram *zram) /* Reset stats */ memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - zram->max_comp_streams = 1; set_capacity(zram->disk, 0); part_stat_set_all(&zram->disk->part0, 0); @@ -1301,7 +1279,6 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; - zram->max_comp_streams = 1; pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 8e92339686d7..06b1636f4722 100644 --- a/drivers/block/zram/zram_drv.h +++ 
b/drivers/block/zram/zram_drv.h @@ -102,7 +102,6 @@ struct zram { * the number of pages zram can consume for storing compressed data */ unsigned long limit_pages; - int max_comp_streams; struct zram_stats stats; atomic_t refcount; /* refcount for zram_meta */ From 9bf02241b09bf87cf109374fd0fcc0120fa6becc Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 17:00:02 -0700 Subject: [PATCH 1143/1212] UPSTREAM: zram: introduce per-device debug_stat sysfs node debug_stat sysfs is read-only and represents various debugging data that zram developers may need. This file is not meant to be used by anyone else: its content is not documented and may change at any time without notice. Therefore, the output of debug_stat file contains a version string. To avoid any confusion, we will increase the version number every time we modify the output. At the moment this file exports only one value -- the number of re-compressions, IOW, the number of times compression fast path has failed. This stat is temporary and will be useful in case any per-cpu compression streams regressions are reported.
Link: http://lkml.kernel.org/r/20160513230834.GB26763@bbox Link: http://lkml.kernel.org/r/20160511134553.12655-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 623e47fc64f8de480b322b7ed68855f97137e2a5) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ie0ef61db7aa0b2c713de1d8bf48e8a545b4276e9 Signed-off-by: Amit Pundir --- Documentation/ABI/testing/sysfs-block-zram | 9 +++++++++ Documentation/blockdev/zram.txt | 1 + drivers/block/zram/zram_drv.c | 21 +++++++++++++++++++++ drivers/block/zram/zram_drv.h | 1 + 4 files changed, 32 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 2e69e83bf510..4518d30b8c2e 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -166,3 +166,12 @@ Description: The mm_stat file is read-only and represents device's mm statistics (orig_data_size, compr_data_size, etc.) in a format similar to block layer statistics file format. + +What: /sys/block/zram/debug_stat +Date: July 2016 +Contact: Sergey Senozhatsky +Description: + The debug_stat file is read-only and represents various + device's debugging info useful for kernel developers. Its + format is not documented intentionally and may change + anytime without any notice. 
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index d88f0c70cd7f..13100fb3c26d 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -172,6 +172,7 @@ mem_limit RW the maximum amount of memory ZRAM can use to store pages_compacted RO the number of pages freed during compaction (available only via zram/mm_stat node) compact WO trigger memory compaction +debug_stat RO this file is used for zram debugging purposes WARNING ======= diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e38f4419cc4e..74fb46afac71 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -435,8 +435,26 @@ static ssize_t mm_stat_show(struct device *dev, return ret; } +static ssize_t debug_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int version = 1; + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "version: %d\n%8llu\n", + version, + (u64)atomic64_read(&zram->stats.writestall)); + up_read(&zram->init_lock); + + return ret; +} + static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); +static DEVICE_ATTR_RO(debug_stat); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); ZRAM_ATTR_RO(failed_reads); @@ -719,6 +737,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, zcomp_strm_release(zram->comp, zstrm); zstrm = NULL; + atomic64_inc(&zram->stats.writestall); + handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM); if (handle) @@ -1181,6 +1201,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_comp_algorithm.attr, &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, + &dev_attr_debug_stat.attr, NULL, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 06b1636f4722..3f5bf66a27e4 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -85,6 +85,7 @@ struct 
zram_stats { atomic64_t zero_pages; /* no. of zero filled pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ + atomic64_t writestall; /* no. of write slow paths */ }; struct zram_meta { From a54133ad2e13cd8fc3c23ebc485601d53efeb2d8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Jul 2016 15:22:42 -0700 Subject: [PATCH 1144/1212] UPSTREAM: zram: rename zstrm find-release functions This started as an 'add zlib support' effort, but after some thinking I saw no blockers for a bigger change -- a switch to crypto API. We don't have an idle zstreams list anymore and our write path now works absolutely differently, preventing preemption during compression. This removes the possibility of read paths preempting writes at the wrong places and opens the door for a move from custom LZO/LZ4 compression backends implementation to a more generic one, using crypto compress API. This patch set also eliminates the need for a new context-less crypto API interface, which was quite hard to sell, so we can move along faster. benchmarks: (x86_64, 4GB, zram-perf script) perf reported run-time fio (max jobs=3). I performed fio tests with an increasing number of parallel jobs (up to 3) on a 3G zram device, using `static' data and the following crypto comp algorithms: 842, deflate, lz4, lz4hc, lzo the output was: - test running time (which can tell us which algorithms perform faster) and - zram mm_stat (which tells the compressed memory size, max used memory, etc). It's just for information. for example, LZ4HC has twice the running time of LZO, but the compressed memory size is: 23592960 vs 34603008 bytes. 
test-fio-zram-842 197.907655282 seconds time elapsed 201.623142884 seconds time elapsed 226.854291345 seconds time elapsed test-fio-zram-DEFLATE 253.259516155 seconds time elapsed 258.148563401 seconds time elapsed 290.251909365 seconds time elapsed test-fio-zram-LZ4 27.022598717 seconds time elapsed 29.580522717 seconds time elapsed 33.293463430 seconds time elapsed test-fio-zram-LZ4HC 56.393954615 seconds time elapsed 74.904659747 seconds time elapsed 101.940998564 seconds time elapsed test-fio-zram-LZO 28.155948075 seconds time elapsed 30.390036330 seconds time elapsed 34.455773159 seconds time elapsed zram mm_stat-s (max fio jobs=3) test-fio-zram-842 mm_stat (jobs1): 3221225472 673185792 690266112 0 690266112 0 0 mm_stat (jobs2): 3221225472 673185792 690266112 0 690266112 0 0 mm_stat (jobs3): 3221225472 673185792 690266112 0 690266112 0 0 test-fio-zram-DEFLATE mm_stat (jobs1): 3221225472 24379392 37761024 0 37761024 0 0 mm_stat (jobs2): 3221225472 24379392 37761024 0 37761024 0 0 mm_stat (jobs3): 3221225472 24379392 37761024 0 37761024 0 0 test-fio-zram-LZ4 mm_stat (jobs1): 3221225472 23592960 37761024 0 37761024 0 0 mm_stat (jobs2): 3221225472 23592960 37761024 0 37761024 0 0 mm_stat (jobs3): 3221225472 23592960 37761024 0 37761024 0 0 test-fio-zram-LZ4HC mm_stat (jobs1): 3221225472 23592960 37761024 0 37761024 0 0 mm_stat (jobs2): 3221225472 23592960 37761024 0 37761024 0 0 mm_stat (jobs3): 3221225472 23592960 37761024 0 37761024 0 0 test-fio-zram-LZO mm_stat (jobs1): 3221225472 34603008 50335744 0 50335744 0 0 mm_stat (jobs2): 3221225472 34603008 50335744 0 50335744 0 0 mm_stat (jobs3): 3221225472 34603008 50335744 0 50339840 0 0 This patch (of 8): We don't perform any zstream idle list lookup anymore, so zcomp_strm_find()/zcomp_strm_release() names are not representative. Rename to zcomp_stream_get()/zcomp_stream_put(). 
Link: http://lkml.kernel.org/r/20160531122017.2878-2-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 2aea8493d326bdf15446768333e1d2c91b040b5c) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I2f4c9e215bca73ba5adb1354aaec6e32e420920d Signed-off-by: Amit Pundir --- drivers/block/zram/zcomp.c | 4 ++-- drivers/block/zram/zcomp.h | 4 ++-- drivers/block/zram/zram_drv.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b51a816d766b..400f8267337e 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -95,12 +95,12 @@ bool zcomp_available_algorithm(const char *comp) return find_backend(comp) != NULL; } -struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) { return *get_cpu_ptr(comp->stream); } -void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +void zcomp_stream_put(struct zcomp *comp) { put_cpu_ptr(comp->stream); } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index ffd88cb747fe..944b8e60dd82 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -48,8 +48,8 @@ bool zcomp_available_algorithm(const char *comp); struct zcomp *zcomp_create(const char *comp); void zcomp_destroy(struct zcomp *comp); -struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); -void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); +struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); +void zcomp_stream_put(struct zcomp *comp); int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, const unsigned char *src, size_t *dst_len); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 74fb46afac71..01bd4380d7b9 100644 --- a/drivers/block/zram/zram_drv.c +++ 
b/drivers/block/zram/zram_drv.c @@ -695,7 +695,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - zstrm = zcomp_strm_find(zram->comp); + zstrm = zcomp_stream_get(zram->comp); ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); @@ -734,7 +734,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, __GFP_NOWARN | __GFP_HIGHMEM); if (!handle) { - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); zstrm = NULL; atomic64_inc(&zram->stats.writestall); @@ -769,7 +769,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, memcpy(cmem, src, clen); } - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); zstrm = NULL; zs_unmap_object(meta->mem_pool, handle); @@ -789,7 +789,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic64_inc(&zram->stats.pages_stored); out: if (zstrm) - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); if (is_partial_io(bvec)) kfree(uncmem); return ret; From 0eeffba8304c2ffd02f86149e986458f045ae04d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Jul 2016 15:22:45 -0700 Subject: [PATCH 1145/1212] BACKPORT: zram: switch to crypto compress API We don't have an idle zstreams list anymore and our write path now works absolutely differently, preventing preemption during compression. This removes the possibility of read paths preempting writes at the wrong places (which could badly affect the performance of both paths) and at the same time opens the door for a move from custom LZO/LZ4 compression backends implementation to a more generic one, using crypto compress API. Joonsoo Kim [1] attempted to do this a while ago, but was faced with the need to introduce a new crypto API interface. 
The root cause was the fact that crypto API compression algorithms require a compression stream structure (in zram terminology) for both compression and decompression ops, while in reality only several of the compression algorithms really need it. This resulted in the concept of context-less crypto API compression backends [2]. Both write and read paths, though, would have been executed with preemption enabled, which could have resulted in decreased worst-case performance, e.g. consider the following case: CPU0 zram_write() spin_lock() take the last idle stream spin_unlock() << preempted >> zram_read() spin_lock() no idle streams spin_unlock() schedule() resuming zram_write compression() but it took me some time to realize that, and it took even longer to evolve zram and to make it ready for crypto API. The key turned out to be -- drop the idle streams list entirely. Without the idle streams list we are free to use compression algorithms that require a compression stream for decompression (read), because streams are now placed in per-cpu data and each write path has to disable preemption for the compression op, almost completely eliminating the aforementioned case (technically, we still have a small chance, because the write path has a fast path and a slow path, and the slow path is executed with preemption enabled; but the fast path fails too rarely for this to matter). 
TEST ==== - 4 CPUs, x86_64 system - 3G zram, lzo - fio tests: read, randread, write, randwrite, rw, randrw test script [3] command: ZRAM_SIZE=3G LOG_SUFFIX=XXXX FIO_LOOPS=5 ./zram-fio-test.sh BASE PATCHED jobs1 READ: 2527.2MB/s 2482.7MB/s READ: 2102.7MB/s 2045.0MB/s WRITE: 1284.3MB/s 1324.3MB/s WRITE: 1080.7MB/s 1101.9MB/s READ: 430125KB/s 437498KB/s WRITE: 430538KB/s 437919KB/s READ: 399593KB/s 403987KB/s WRITE: 399910KB/s 404308KB/s jobs2 READ: 8133.5MB/s 7854.8MB/s READ: 7086.6MB/s 6912.8MB/s WRITE: 3177.2MB/s 3298.3MB/s WRITE: 2810.2MB/s 2871.4MB/s READ: 1017.6MB/s 1023.4MB/s WRITE: 1018.2MB/s 1023.1MB/s READ: 977836KB/s 984205KB/s WRITE: 979435KB/s 985814KB/s jobs3 READ: 13557MB/s 13391MB/s READ: 11876MB/s 11752MB/s WRITE: 4641.5MB/s 4682.1MB/s WRITE: 4164.9MB/s 4179.3MB/s READ: 1453.8MB/s 1455.1MB/s WRITE: 1455.1MB/s 1458.2MB/s READ: 1387.7MB/s 1395.7MB/s WRITE: 1386.1MB/s 1394.9MB/s jobs4 READ: 20271MB/s 20078MB/s READ: 18033MB/s 17928MB/s WRITE: 6176.8MB/s 6180.5MB/s WRITE: 5686.3MB/s 5705.3MB/s READ: 2009.4MB/s 2006.7MB/s WRITE: 2007.5MB/s 2004.9MB/s READ: 1929.7MB/s 1935.6MB/s WRITE: 1926.8MB/s 1932.6MB/s jobs5 READ: 18823MB/s 19024MB/s READ: 18968MB/s 19071MB/s WRITE: 6191.6MB/s 6372.1MB/s WRITE: 5818.7MB/s 5787.1MB/s READ: 2011.7MB/s 1981.3MB/s WRITE: 2011.4MB/s 1980.1MB/s READ: 1949.3MB/s 1935.7MB/s WRITE: 1940.4MB/s 1926.1MB/s jobs6 READ: 21870MB/s 21715MB/s READ: 19957MB/s 19879MB/s WRITE: 6528.4MB/s 6537.6MB/s WRITE: 6098.9MB/s 6073.6MB/s READ: 2048.6MB/s 2049.9MB/s WRITE: 2041.7MB/s 2042.9MB/s READ: 2013.4MB/s 1990.4MB/s WRITE: 2009.4MB/s 1986.5MB/s jobs7 READ: 21359MB/s 21124MB/s READ: 19746MB/s 19293MB/s WRITE: 6660.4MB/s 6518.8MB/s WRITE: 6211.6MB/s 6193.1MB/s READ: 2089.7MB/s 2080.6MB/s WRITE: 2085.8MB/s 2076.5MB/s READ: 2041.2MB/s 2052.5MB/s WRITE: 2037.5MB/s 2048.8MB/s jobs8 READ: 20477MB/s 19974MB/s READ: 18922MB/s 18576MB/s WRITE: 6851.9MB/s 6788.3MB/s WRITE: 6407.7MB/s 6347.5MB/s READ: 2134.8MB/s 2136.1MB/s WRITE: 2132.8MB/s 2134.4MB/s 
READ: 2074.2MB/s 2069.6MB/s WRITE: 2087.3MB/s 2082.4MB/s jobs9 READ: 19797MB/s 19994MB/s READ: 18806MB/s 18581MB/s WRITE: 6878.7MB/s 6822.7MB/s WRITE: 6456.8MB/s 6447.2MB/s READ: 2141.1MB/s 2154.7MB/s WRITE: 2144.4MB/s 2157.3MB/s READ: 2084.1MB/s 2085.1MB/s WRITE: 2091.5MB/s 2092.5MB/s jobs10 READ: 19794MB/s 19784MB/s READ: 18794MB/s 18745MB/s WRITE: 6984.4MB/s 6676.3MB/s WRITE: 6532.3MB/s 6342.7MB/s READ: 2150.6MB/s 2155.4MB/s WRITE: 2156.8MB/s 2161.5MB/s READ: 2106.4MB/s 2095.6MB/s WRITE: 2109.7MB/s 2098.4MB/s BASE PATCHED jobs1 perfstat stalled-cycles-frontend 102,480,595,419 ( 41.53%) 114,508,864,804 ( 46.92%) stalled-cycles-backend 51,941,417,832 ( 21.05%) 46,836,112,388 ( 19.19%) instructions 283,612,054,215 ( 1.15) 283,918,134,959 ( 1.16) branches 56,372,560,385 ( 724.923) 56,449,814,753 ( 733.766) branch-misses 374,826,000 ( 0.66%) 326,935,859 ( 0.58%) jobs2 perfstat stalled-cycles-frontend 155,142,745,777 ( 40.99%) 164,170,979,198 ( 43.82%) stalled-cycles-backend 70,813,866,387 ( 18.71%) 66,456,858,165 ( 17.74%) instructions 463,436,648,173 ( 1.22) 464,221,890,191 ( 1.24) branches 91,088,733,902 ( 760.088) 91,278,144,546 ( 769.133) branch-misses 504,460,363 ( 0.55%) 394,033,842 ( 0.43%) jobs3 perfstat stalled-cycles-frontend 201,300,397,212 ( 39.84%) 223,969,902,257 ( 44.44%) stalled-cycles-backend 87,712,593,974 ( 17.36%) 81,618,888,712 ( 16.19%) instructions 642,869,545,023 ( 1.27) 644,677,354,132 ( 1.28) branches 125,724,560,594 ( 690.682) 126,133,159,521 ( 694.542) branch-misses 527,941,798 ( 0.42%) 444,782,220 ( 0.35%) jobs4 perfstat stalled-cycles-frontend 246,701,197,429 ( 38.12%) 280,076,030,886 ( 43.29%) stalled-cycles-backend 119,050,341,112 ( 18.40%) 110,955,641,671 ( 17.15%) instructions 822,716,962,127 ( 1.27) 825,536,969,320 ( 1.28) branches 160,590,028,545 ( 688.614) 161,152,996,915 ( 691.068) branch-misses 650,295,287 ( 0.40%) 550,229,113 ( 0.34%) jobs5 perfstat stalled-cycles-frontend 298,958,462,516 ( 38.30%) 344,852,200,358 ( 44.16%) 
stalled-cycles-backend 137,558,742,122 ( 17.62%) 129,465,067,102 ( 16.58%) instructions 1,005,714,688,752 ( 1.29) 1,007,657,999,432 ( 1.29) branches 195,988,773,962 ( 697.730) 196,446,873,984 ( 700.319) branch-misses 695,818,940 ( 0.36%) 624,823,263 ( 0.32%) jobs6 perfstat stalled-cycles-frontend 334,497,602,856 ( 36.71%) 387,590,419,779 ( 42.38%) stalled-cycles-backend 163,539,365,335 ( 17.95%) 152,640,193,639 ( 16.69%) instructions 1,184,738,177,851 ( 1.30) 1,187,396,281,677 ( 1.30) branches 230,592,915,640 ( 702.902) 231,253,802,882 ( 702.356) branch-misses 747,934,786 ( 0.32%) 643,902,424 ( 0.28%) jobs7 perfstat stalled-cycles-frontend 396,724,684,187 ( 37.71%) 460,705,858,952 ( 43.84%) stalled-cycles-backend 188,096,616,496 ( 17.88%) 175,785,787,036 ( 16.73%) instructions 1,364,041,136,608 ( 1.30) 1,366,689,075,112 ( 1.30) branches 265,253,096,936 ( 700.078) 265,890,524,883 ( 702.839) branch-misses 784,991,589 ( 0.30%) 729,196,689 ( 0.27%) jobs8 perfstat stalled-cycles-frontend 440,248,299,870 ( 36.92%) 509,554,793,816 ( 42.46%) stalled-cycles-backend 222,575,930,616 ( 18.67%) 213,401,248,432 ( 17.78%) instructions 1,542,262,045,114 ( 1.29) 1,545,233,932,257 ( 1.29) branches 299,775,178,439 ( 697.666) 300,528,458,505 ( 694.769) branch-misses 847,496,084 ( 0.28%) 748,794,308 ( 0.25%) jobs9 perfstat stalled-cycles-frontend 506,269,882,480 ( 37.86%) 592,798,032,820 ( 44.43%) stalled-cycles-backend 253,192,498,861 ( 18.93%) 233,727,666,185 ( 17.52%) instructions 1,721,985,080,913 ( 1.29) 1,724,666,236,005 ( 1.29) branches 334,517,360,255 ( 694.134) 335,199,758,164 ( 697.131) branch-misses 873,496,730 ( 0.26%) 815,379,236 ( 0.24%) jobs10 perfstat stalled-cycles-frontend 549,063,363,749 ( 37.18%) 651,302,376,662 ( 43.61%) stalled-cycles-backend 281,680,986,810 ( 19.07%) 277,005,235,582 ( 18.55%) instructions 1,901,859,271,180 ( 1.29) 1,906,311,064,230 ( 1.28) branches 369,398,536,153 ( 694.004) 370,527,696,358 ( 688.409) branch-misses 967,929,335 ( 0.26%) 
890,125,056 ( 0.24%) BASE PATCHED seconds elapsed 79.421641008 78.735285546 seconds elapsed 61.471246133 60.869085949 seconds elapsed 62.317058173 62.224188495 seconds elapsed 60.030739363 60.081102518 seconds elapsed 74.070398362 74.317582865 seconds elapsed 84.985953007 85.414364176 seconds elapsed 97.724553255 98.173311344 seconds elapsed 109.488066758 110.268399318 seconds elapsed 122.768189405 122.967164498 seconds elapsed 135.130035105 136.934770801 On my other system (8 x86_64 CPUs, short version of test results): BASE PATCHED seconds elapsed 19.518065994 19.806320662 seconds elapsed 15.172772749 15.594718291 seconds elapsed 13.820925970 13.821708564 seconds elapsed 13.293097816 14.585206405 seconds elapsed 16.207284118 16.064431606 seconds elapsed 17.958376158 17.771825767 seconds elapsed 19.478009164 19.602961508 seconds elapsed 21.347152811 21.352318709 seconds elapsed 24.478121126 24.171088735 seconds elapsed 26.865057442 26.767327618 So performance-wise the numbers are quite similar. Also update zcomp interface to be more aligned with the crypto API. 
[1] http://marc.info/?l=linux-kernel&m=144480832108927&w=2 [2] http://marc.info/?l=linux-kernel&m=145379613507518&w=2 [3] https://github.com/sergey-senozhatsky/zram-perf-test Link: http://lkml.kernel.org/r/20160531122017.2878-3-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Suggested-by: Minchan Kim Suggested-by: Joonsoo Kim Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit ebaf9ab56d9d5f350969bd1ea8f47234623c9684) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ia0c362b7419de59e6c6ea81c37f99ef1d22c2b4b Signed-off-by: Amit Pundir --- drivers/block/zram/Kconfig | 10 ++--- drivers/block/zram/zcomp.c | 76 ++++++++++++++++++++++------------- drivers/block/zram/zcomp.h | 17 ++++---- drivers/block/zram/zram_drv.c | 18 +++++---- 4 files changed, 69 insertions(+), 52 deletions(-) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 386ba3d1a6ee..2252cd7d0e89 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -1,8 +1,7 @@ config ZRAM tristate "Compressed RAM block device support" - depends on BLOCK && SYSFS && ZSMALLOC - select LZO_COMPRESS - select LZO_DECOMPRESS + depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO + select CRYPTO_LZO default n help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). @@ -18,9 +17,8 @@ config ZRAM config ZRAM_LZ4_COMPRESS bool "Enable LZ4 algorithm support" depends on ZRAM - select LZ4_COMPRESS - select LZ4_DECOMPRESS + select CRYPTO_LZ4 default n help This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file + algorithm can be changed using `comp_algorithm' device attribute. 
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 400f8267337e..f35726860a1b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -14,42 +14,39 @@ #include #include #include +#include #include "zcomp.h" -#include "zcomp_lzo.h" -#ifdef CONFIG_ZRAM_LZ4_COMPRESS -#include "zcomp_lz4.h" -#endif -static struct zcomp_backend *backends[] = { - &zcomp_lzo, +static const char * const backends[] = { + "lzo", #ifdef CONFIG_ZRAM_LZ4_COMPRESS - &zcomp_lz4, + "lz4", #endif NULL }; -static struct zcomp_backend *find_backend(const char *compress) +static const char *find_backend(const char *compress) { int i = 0; while (backends[i]) { - if (sysfs_streq(compress, backends[i]->name)) + if (sysfs_streq(compress, backends[i])) break; i++; } return backends[i]; } -static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +static void zcomp_strm_free(struct zcomp_strm *zstrm) { - if (zstrm->private) - comp->backend->destroy(zstrm->private); + if (!IS_ERR_OR_NULL(zstrm->tfm)) + crypto_free_comp(zstrm->tfm); free_pages((unsigned long)zstrm->buffer, 1); kfree(zstrm); } /* - * allocate new zcomp_strm structure with ->private initialized by + * allocate new zcomp_strm structure with ->tfm initialized by * backend, return NULL on error */ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) @@ -58,14 +55,14 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) if (!zstrm) return NULL; - zstrm->private = comp->backend->create(flags); + zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); /* * allocate 2 pages. 
1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); - if (!zstrm->private || !zstrm->buffer) { - zcomp_strm_free(comp, zstrm); + if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { + zcomp_strm_free(zstrm); zstrm = NULL; } return zstrm; @@ -78,12 +75,12 @@ ssize_t zcomp_available_show(const char *comp, char *buf) int i = 0; while (backends[i]) { - if (!strcmp(comp, backends[i]->name)) + if (!strcmp(comp, backends[i])) sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]->name); + "[%s] ", backends[i]); else sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]->name); + "%s ", backends[i]); i++; } sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); @@ -105,17 +102,38 @@ void zcomp_stream_put(struct zcomp *comp) put_cpu_ptr(comp->stream); } -int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, - const unsigned char *src, size_t *dst_len) +int zcomp_compress(struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len) { - return comp->backend->compress(src, zstrm->buffer, dst_len, - zstrm->private); + /* + * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized + * because sometimes we can endup having a bigger compressed data + * due to various reasons: for example compression algorithms tend + * to add some padding to the compressed buffer. Speaking of padding, + * comp algorithm `842' pads the compressed length to multiple of 8 + * and returns -ENOSP when the dst memory is not big enough, which + * is not something that ZRAM wants to see. We can handle the + * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we + * receive -ERRNO from the compressing backend we can't help it + * anymore. To make `842' happy we need to tell the exact size of + * the dst buffer, zram_drv will take care of the fact that + * compressed buffer is too big. 
+ */ + *dst_len = PAGE_SIZE * 2; + + return crypto_comp_compress(zstrm->tfm, + src, PAGE_SIZE, + zstrm->buffer, dst_len); } -int zcomp_decompress(struct zcomp *comp, const unsigned char *src, - size_t src_len, unsigned char *dst) +int zcomp_decompress(struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst) { - return comp->backend->decompress(src, src_len, dst); + unsigned int dst_len = PAGE_SIZE; + + return crypto_comp_decompress(zstrm->tfm, + src, src_len, + dst, &dst_len); } static int __zcomp_cpu_notifier(struct zcomp *comp, @@ -138,7 +156,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, case CPU_UP_CANCELED: zstrm = *per_cpu_ptr(comp->stream, cpu); if (!IS_ERR_OR_NULL(zstrm)) - zcomp_strm_free(comp, zstrm); + zcomp_strm_free(zstrm); *per_cpu_ptr(comp->stream, cpu) = NULL; break; default: @@ -209,7 +227,7 @@ void zcomp_destroy(struct zcomp *comp) struct zcomp *zcomp_create(const char *compress) { struct zcomp *comp; - struct zcomp_backend *backend; + const char *backend; int error; backend = find_backend(compress); @@ -220,7 +238,7 @@ struct zcomp *zcomp_create(const char *compress) if (!comp) return ERR_PTR(-ENOMEM); - comp->backend = backend; + comp->name = backend; error = zcomp_init(comp); if (error) { kfree(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 944b8e60dd82..c914ab7972ef 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -13,12 +13,7 @@ struct zcomp_strm { /* compression/decompression buffer */ void *buffer; - /* - * The private data of the compression stream, only compression - * stream backend can touch this (e.g. 
compression algorithm - * working memory) - */ - void *private; + struct crypto_comp *tfm; }; /* static compression backend */ @@ -40,6 +35,8 @@ struct zcomp { struct zcomp_strm * __percpu *stream; struct zcomp_backend *backend; struct notifier_block notifier; + + const char *name; }; ssize_t zcomp_available_show(const char *comp, char *buf); @@ -51,11 +48,11 @@ void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); void zcomp_stream_put(struct zcomp *comp); -int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, - const unsigned char *src, size_t *dst_len); +int zcomp_compress(struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len); -int zcomp_decompress(struct zcomp *comp, const unsigned char *src, - size_t src_len, unsigned char *dst); +int zcomp_decompress(struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst); bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 01bd4380d7b9..b5a8b6e840b1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -563,7 +563,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; - size_t size; + unsigned int size; bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; @@ -576,10 +576,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (size == PAGE_SIZE) + if (size == PAGE_SIZE) { memcpy(mem, cmem, PAGE_SIZE); - else - ret = zcomp_decompress(zram->comp, cmem, size, mem); + } else { + struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); + + ret = zcomp_decompress(zstrm, cmem, size, mem); + zcomp_stream_put(zram->comp); + } zs_unmap_object(meta->mem_pool, handle); 
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); @@ -646,7 +650,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { int ret = 0; - size_t clen; + unsigned int clen; unsigned long handle = 0; struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; @@ -696,7 +700,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); + ret = zcomp_compress(zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; @@ -744,7 +748,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (handle) goto compress_again; - pr_err("Error allocating memory for compressed page: %u, size=%zu\n", + pr_err("Error allocating memory for compressed page: %u, size=%u\n", index, clen); ret = -ENOMEM; goto out; From 4e6affffba21c73e5a7602bfa0539aefc177f43c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Jul 2016 15:22:48 -0700 Subject: [PATCH 1146/1212] UPSTREAM: zram: use crypto api to check alg availability There is no way to get a string with all the crypto comp algorithms supported by the crypto comp engine, so we need to maintain our own backends list. At the same time we additionally need to use crypto_has_comp() to make sure that the user has requested a compression algorithm that is recognized by the crypto comp engine. Relying on /proc/crypto is not an option here, because it does not show not-yet-inserted compression modules. Example: modprobe zram cat /proc/crypto | grep -i lz4 modprobe lz4 cat /proc/crypto | grep -i lz4 name : lz4 driver : lz4-generic module : lz4 So the user can't tell exactly whether lz4 is really supported from /proc/crypto output, unless someone or something has loaded it. This patch also adds crypto_has_comp() to zcomp_available_show(). 
We store all the compression algorithm names in zcomp's `backends' array, regardless of the CONFIG_CRYPTO_FOO configuration, but show only those that are also supported by the crypto engine. This helps the user know the exact list of compression algorithms that can be used. Example: module lz4 is not loaded yet, but is supported by the crypto engine. /proc/crypto has no information on this module, while zram's `comp_algorithm' lists it: cat /proc/crypto | grep -i lz4 cat /sys/block/zram0/comp_algorithm [lzo] lz4 deflate lz4hc 842 We still use the `backends' array to determine if the requested compression backend is known to crypto api. This array, however, may not contain some entries, so as the last step we call the crypto_has_comp() function, which attempts to insmod the requested compression algorithm to determine if crypto api supports it. The advantage of this method is that now we permit the usage of out-of-tree crypto compression modules (implementing S/W or H/W compression). [sergey.senozhatsky@gmail.com: zram-use-crypto-api-to-check-alg-availability-v3] Link: http://lkml.kernel.org/r/20160604024902.11778-4-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20160531122017.2878-5-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 415403be37e204632b17bdb6857890fe5a220cea) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I7c823238329bd6e5180386507d16123228804cc5 Signed-off-by: Amit Pundir --- Documentation/blockdev/zram.txt | 11 ++++++ drivers/block/zram/zcomp.c | 69 ++++++++++++++++++++------------- drivers/block/zram/zram_drv.c | 16 ++++---- drivers/block/zram/zram_drv.h | 5 ++- 4 files changed, 64 insertions(+), 37 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 13100fb3c26d..7c05357360a7 100644 --- 
a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -83,6 +83,17 @@ pre-created. Default: 1. #select lzo compression algorithm echo lzo > /sys/block/zram0/comp_algorithm + For the time being, the `comp_algorithm' content does not necessarily + show every compression algorithm supported by the kernel. We keep this + list primarily to simplify device configuration and one can configure + a new device with a compression algorithm that is not listed in + `comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API + and, if some of the algorithms were built as modules, it's impossible + to list all of them using, for instance, /proc/crypto or any other + method. This, however, has an advantage of permitting the usage of + custom crypto compression modules (implementing S/W or H/W + compression). + 4) Set Disksize Set disk size by writing the value to sysfs node 'disksize'. The value can be either in bytes or you can use mem suffixes. diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index f35726860a1b..a2b4eb85b41d 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -26,17 +26,6 @@ static const char * const backends[] = { NULL }; -static const char *find_backend(const char *compress) -{ - int i = 0; - while (backends[i]) { - if (sysfs_streq(compress, backends[i])) - break; - i++; - } - return backends[i]; -} - static void zcomp_strm_free(struct zcomp_strm *zstrm) { if (!IS_ERR_OR_NULL(zstrm->tfm)) @@ -68,28 +57,54 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) return zstrm; } -/* show available compressors */ -ssize_t zcomp_available_show(const char *comp, char *buf) +bool zcomp_available_algorithm(const char *comp) { - ssize_t sz = 0; int i = 0; while (backends[i]) { - if (!strcmp(comp, backends[i])) - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]); - else - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]); + if 
(sysfs_streq(comp, backends[i])) + return true; i++; } - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); - return sz; + + /* + * Crypto does not ignore a trailing new line symbol, + * so make sure you don't supply a string containing + * one. + * This also means that we permit zcomp initialisation + * with any compressing algorithm known to crypto api. + */ + return crypto_has_comp(comp, 0, 0) == 1; } -bool zcomp_available_algorithm(const char *comp) +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) { - return find_backend(comp) != NULL; + bool known_algorithm = false; + ssize_t sz = 0; + int i = 0; + + for (; backends[i]; i++) { + if (!strcmp(comp, backends[i])) { + known_algorithm = true; + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]); + } else { + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]); + } + } + + /* + * Out-of-tree module known to crypto api or a missing + * entry in `backends'. + */ + if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1) + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", comp); + + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + return sz; } struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) @@ -227,18 +242,16 @@ void zcomp_destroy(struct zcomp *comp) struct zcomp *zcomp_create(const char *compress) { struct zcomp *comp; - const char *backend; int error; - backend = find_backend(compress); - if (!backend) + if (!zcomp_available_algorithm(compress)) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) return ERR_PTR(-ENOMEM); - comp->name = backend; + comp->name = compress; error = zcomp_init(comp); if (error) { kfree(comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b5a8b6e840b1..be2ee475be5c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -342,9 +342,16 @@ static ssize_t comp_algorithm_store(struct device *dev, struct 
device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); + char compressor[CRYPTO_MAX_ALG_NAME]; size_t sz; - if (!zcomp_available_algorithm(buf)) + strlcpy(compressor, buf, sizeof(compressor)); + /* ignore trailing newline */ + sz = strlen(compressor); + if (sz > 0 && compressor[sz - 1] == '\n') + compressor[sz - 1] = 0x00; + + if (!zcomp_available_algorithm(compressor)) return -EINVAL; down_write(&zram->init_lock); @@ -353,13 +360,8 @@ static ssize_t comp_algorithm_store(struct device *dev, pr_info("Can't change algorithm for initialized device\n"); return -EBUSY; } - strlcpy(zram->compressor, buf, sizeof(zram->compressor)); - - /* ignore trailing newline */ - sz = strlen(zram->compressor); - if (sz > 0 && zram->compressor[sz - 1] == '\n') - zram->compressor[sz - 1] = 0x00; + strlcpy(zram->compressor, compressor, sizeof(compressor)); up_write(&zram->init_lock); return len; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 3f5bf66a27e4..74fcf10da374 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -15,8 +15,9 @@ #ifndef _ZRAM_DRV_H_ #define _ZRAM_DRV_H_ -#include +#include #include +#include #include "zcomp.h" @@ -113,7 +114,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - char compressor[10]; + char compressor[CRYPTO_MAX_ALG_NAME]; /* * zram is claimed so open request will be failed */ From feed2aa2a6913a2baf594445e930313aebe238d9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Jul 2016 15:22:51 -0700 Subject: [PATCH 1147/1212] UPSTREAM: zram: cosmetic: cleanup documentation zram documentation is a mix of different styles: spaces, tabs, tabs + spaces, etc. Clean it up. 
Link: http://lkml.kernel.org/r/20160531122017.2878-6-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Joonsoo Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 69a30a8d2ac17c8080cf6ebfc91149fd6c2648b3) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ib71a1933e3da12b3a9f29b805a458cdc9815c36b Signed-off-by: Amit Pundir --- Documentation/blockdev/zram.txt | 85 ++++++++++++++++----------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 7c05357360a7..0535ae1f73e5 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -59,23 +59,23 @@ num_devices parameter is optional and tells zram how many devices should be pre-created. Default: 1. 2) Set max number of compression streams - Regardless the value passed to this attribute, ZRAM will always - allocate multiple compression streams - one per online CPUs - thus - allowing several concurrent compression operations. The number of - allocated compression streams goes down when some of the CPUs - become offline. There is no single-compression-stream mode anymore, - unless you are running a UP system or has only 1 CPU online. +Regardless the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPUs - thus +allowing several concurrent compression operations. The number of +allocated compression streams goes down when some of the CPUs +become offline. There is no single-compression-stream mode anymore, +unless you are running a UP system or has only 1 CPU online. 
- To find out how many streams are currently available: +To find out how many streams are currently available: cat /sys/block/zram0/max_comp_streams 3) Select compression algorithm - Using comp_algorithm device attribute one can see available and - currently selected (shown in square brackets) compression algorithms, - change selected compression algorithm (once the device is initialised - there is no way to change compression algorithm). +Using comp_algorithm device attribute one can see available and +currently selected (shown in square brackets) compression algorithms, +change selected compression algorithm (once the device is initialised +there is no way to change compression algorithm). - Examples: +Examples: #show supported compression algorithms cat /sys/block/zram0/comp_algorithm lzo [lz4] @@ -83,28 +83,27 @@ pre-created. Default: 1. #select lzo compression algorithm echo lzo > /sys/block/zram0/comp_algorithm - For the time being, the `comp_algorithm' content does not necessarily - show every compression algorithm supported by the kernel. We keep this - list primarily to simplify device configuration and one can configure - a new device with a compression algorithm that is not listed in - `comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API - and, if some of the algorithms were built as modules, it's impossible - to list all of them using, for instance, /proc/crypto or any other - method. This, however, has an advantage of permitting the usage of - custom crypto compression modules (implementing S/W or H/W - compression). +For the time being, the `comp_algorithm' content does not necessarily +show every compression algorithm supported by the kernel. We keep this +list primarily to simplify device configuration and one can configure +a new device with a compression algorithm that is not listed in +`comp_algorithm'. 
The thing is that, internally, ZRAM uses Crypto API +and, if some of the algorithms were built as modules, it's impossible +to list all of them using, for instance, /proc/crypto or any other +method. This, however, has an advantage of permitting the usage of +custom crypto compression modules (implementing S/W or H/W compression). 4) Set Disksize - Set disk size by writing the value to sysfs node 'disksize'. - The value can be either in bytes or you can use mem suffixes. - Examples: - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize +Set disk size by writing the value to sysfs node 'disksize'. +The value can be either in bytes or you can use mem suffixes. +Examples: + # Initialize /dev/zram0 with 50MB disksize + echo $((50*1024*1024)) > /sys/block/zram0/disksize - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize + # Using mem suffixes + echo 256K > /sys/block/zram0/disksize + echo 512M > /sys/block/zram0/disksize + echo 1G > /sys/block/zram0/disksize Note: There is little point creating a zram of greater than twice the size of memory @@ -112,20 +111,20 @@ since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. 5) Set memory limit: Optional - Set memory limit by writing the value to sysfs node 'mem_limit'. - The value can be either in bytes or you can use mem suffixes. - In addition, you could change the value in runtime. - Examples: - # limit /dev/zram0 with 50MB memory - echo $((50*1024*1024)) > /sys/block/zram0/mem_limit +Set memory limit by writing the value to sysfs node 'mem_limit'. +The value can be either in bytes or you can use mem suffixes. +In addition, you could change the value in runtime. 
+Examples: + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit - # Using mem suffixes - echo 256K > /sys/block/zram0/mem_limit - echo 512M > /sys/block/zram0/mem_limit - echo 1G > /sys/block/zram0/mem_limit + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit - # To disable memory limit - echo 0 > /sys/block/zram0/mem_limit + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit 6) Activate: mkswap /dev/zram0 From 30a850bbf1b017cd20030a986502b92334129452 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Jul 2016 15:22:54 -0700 Subject: [PATCH 1148/1212] UPSTREAM: zram: delete custom lzo/lz4 Remove lzo/lz4 backends, we use crypto API now. [sergey.senozhatsky@gmail.com: zram-delete-custom-lzo-lz4-v3] Link: http://lkml.kernel.org/r/20160604024902.11778-6-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20160531122017.2878-7-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit ce1ed9f98e888aa220fb09da2e2bcfcfba218a27) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ic2aa300a1a66b61740da73833dab252dc0d4b74a Signed-off-by: Amit Pundir --- drivers/block/zram/Kconfig | 9 ------ drivers/block/zram/Makefile | 4 +-- drivers/block/zram/zcomp.c | 2 +- drivers/block/zram/zcomp.h | 15 --------- drivers/block/zram/zcomp_lz4.c | 56 ---------------------------------- drivers/block/zram/zcomp_lz4.h | 17 ----------- drivers/block/zram/zcomp_lzo.c | 56 ---------------------------------- drivers/block/zram/zcomp_lzo.h | 17 ----------- 8 files changed, 2 insertions(+), 174 deletions(-) delete mode 100644 drivers/block/zram/zcomp_lz4.c delete mode 100644 drivers/block/zram/zcomp_lz4.h delete mode 100644 drivers/block/zram/zcomp_lzo.c delete mode 100644 drivers/block/zram/zcomp_lzo.h 
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 2252cd7d0e89..b8ecba6dcd3b 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,12 +13,3 @@ config ZRAM disks and maybe many more. See zram.txt for more information. - -config ZRAM_LZ4_COMPRESS - bool "Enable LZ4 algorithm support" - depends on ZRAM - select CRYPTO_LZ4 - default n - help - This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute. diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index be0763ff57a2..9e2b79e9a990 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,5 +1,3 @@ -zram-y := zcomp_lzo.o zcomp.o zram_drv.o - -zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o +zram-y := zcomp.o zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index a2b4eb85b41d..9ab45d41624b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -20,7 +20,7 @@ static const char * const backends[] = { "lzo", -#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#if IS_ENABLED(CONFIG_CRYPTO_LZ4) "lz4", #endif NULL diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index c914ab7972ef..478cac2ed465 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -16,24 +16,9 @@ struct zcomp_strm { struct crypto_comp *tfm; }; -/* static compression backend */ -struct zcomp_backend { - int (*compress)(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private); - - int (*decompress)(const unsigned char *src, size_t src_len, - unsigned char *dst); - - void *(*create)(gfp_t flags); - void (*destroy)(void *private); - - const char *name; -}; - /* dynamic per-device compression frontend */ struct zcomp { struct zcomp_strm * __percpu *stream; - struct zcomp_backend *backend; struct notifier_block notifier; const char *name; diff --git 
a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c deleted file mode 100644 index 0110086accba..000000000000 --- a/drivers/block/zram/zcomp_lz4.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include "zcomp_lz4.h" - -static void *zcomp_lz4_create(gfp_t flags) -{ - void *ret; - - ret = kmalloc(LZ4_MEM_COMPRESS, flags); - if (!ret) - ret = __vmalloc(LZ4_MEM_COMPRESS, - flags | __GFP_HIGHMEM, - PAGE_KERNEL); - return ret; -} - -static void zcomp_lz4_destroy(void *private) -{ - kvfree(private); -} - -static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private) -{ - /* return : Success if return 0 */ - return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); -} - -static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, - unsigned char *dst) -{ - size_t dst_len = PAGE_SIZE; - /* return : Success if return 0 */ - return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); -} - -struct zcomp_backend zcomp_lz4 = { - .compress = zcomp_lz4_compress, - .decompress = zcomp_lz4_decompress, - .create = zcomp_lz4_create, - .destroy = zcomp_lz4_destroy, - .name = "lz4", -}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h deleted file mode 100644 index 60613fb29dd8..000000000000 --- a/drivers/block/zram/zcomp_lz4.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ZCOMP_LZ4_H_ -#define _ZCOMP_LZ4_H_ - -#include "zcomp.h" - -extern struct zcomp_backend zcomp_lz4; - -#endif /* _ZCOMP_LZ4_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c deleted file mode 100644 index ed7a1f0549ec..000000000000 --- a/drivers/block/zram/zcomp_lzo.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include "zcomp_lzo.h" - -static void *lzo_create(gfp_t flags) -{ - void *ret; - - ret = kmalloc(LZO1X_MEM_COMPRESS, flags); - if (!ret) - ret = __vmalloc(LZO1X_MEM_COMPRESS, - flags | __GFP_HIGHMEM, - PAGE_KERNEL); - return ret; -} - -static void lzo_destroy(void *private) -{ - kvfree(private); -} - -static int lzo_compress(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private) -{ - int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); - return ret == LZO_E_OK ? 0 : ret; -} - -static int lzo_decompress(const unsigned char *src, size_t src_len, - unsigned char *dst) -{ - size_t dst_len = PAGE_SIZE; - int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); - return ret == LZO_E_OK ? 
0 : ret; -} - -struct zcomp_backend zcomp_lzo = { - .compress = lzo_compress, - .decompress = lzo_decompress, - .create = lzo_create, - .destroy = lzo_destroy, - .name = "lzo", -}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h deleted file mode 100644 index 128c5807fa14..000000000000 --- a/drivers/block/zram/zcomp_lzo.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ZCOMP_LZO_H_ -#define _ZCOMP_LZO_H_ - -#include "zcomp.h" - -extern struct zcomp_backend zcomp_lzo; - -#endif /* _ZCOMP_LZO_H_ */ From 4225fb587ba9b7fb73bc642a76455b17fc43fba7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Jul 2016 15:22:56 -0700 Subject: [PATCH 1149/1212] UPSTREAM: zram: add more compression algorithms Add "deflate", "lz4hc", "842" algorithms to the list of known compression backends. The real availability of those algorithms, however, depends on the corresponding CONFIG_CRYPTO_FOO config options. 
[sergey.senozhatsky@gmail.com: zram-add-more-compression-algorithms-v3] Link: http://lkml.kernel.org/r/20160604024902.11778-7-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20160531122017.2878-8-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit eb9f56d82547db407779967a2251ea28969245b0) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ie46c7676363ef13c559b45dab4968e2cc48a6cbe Signed-off-by: Amit Pundir --- drivers/block/zram/zcomp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 9ab45d41624b..32e521a2b8c8 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -22,6 +22,15 @@ static const char * const backends[] = { "lzo", #if IS_ENABLED(CONFIG_CRYPTO_LZ4) "lz4", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE) + "deflate", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) + "lz4hc", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_842) + "842", #endif NULL }; From ac99063cdf6c9fae5aa7699150381f443bde84fc Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 26 Jul 2016 15:22:59 -0700 Subject: [PATCH 1150/1212] UPSTREAM: zram: drop gfp_t from zcomp_strm_alloc() We now allocate streams from CPU_UP hot-plug path, there are no context-dependent stream allocations anymore and we can schedule from zcomp_strm_alloc(). Use GFP_KERNEL directly and drop a gfp_t parameter. 
Link: http://lkml.kernel.org/r/20160531122017.2878-9-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 16d37725a042cc66f9ee95889dd40e734264508e) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: If09c4a97f3d3e45ad578d2b1d64b26f65617774d Signed-off-by: Amit Pundir --- drivers/block/zram/zcomp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 32e521a2b8c8..4b5cd3a7b2b6 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -47,9 +47,9 @@ static void zcomp_strm_free(struct zcomp_strm *zstrm) * allocate new zcomp_strm structure with ->tfm initialized by * backend, return NULL on error */ -static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); if (!zstrm) return NULL; @@ -58,7 +58,7 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) * allocate 2 pages. 
1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { zcomp_strm_free(zstrm); zstrm = NULL; @@ -169,7 +169,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, case CPU_UP_PREPARE: if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) break; - zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + zstrm = zcomp_strm_alloc(comp); if (IS_ERR_OR_NULL(zstrm)) { pr_err("Can't allocate a compression stream\n"); return NOTIFY_BAD; From 1cf7e4f3e9def846467a5c1deee42aa58a5382aa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 26 Jul 2016 15:23:34 -0700 Subject: [PATCH 1151/1212] UPSTREAM: zram: use __GFP_MOVABLE for memory allocation Zsmalloc is ready for page migration so zram can use __GFP_MOVABLE from now on. I did test to see how it helps to make higher order pages. Test scenario is as follows. KVM guest, 1G memory, ext4 formated zram block device, for i in `seq 1 8`; do dd if=/dev/vda1 of=mnt/test$i.txt bs=128M count=1 & done wait `pidof dd` for i in `seq 1 2 8`; do rm -rf mnt/test$i.txt done fstrim -v mnt echo "init" cat /proc/buddyinfo echo "compaction" echo 1 > /proc/sys/vm/compact_memory cat /proc/buddyinfo old: init Node 0, zone DMA 208 120 51 41 11 0 0 0 0 0 0 Node 0, zone DMA32 16380 13777 9184 3805 789 54 3 0 0 0 0 compaction Node 0, zone DMA 132 82 40 39 16 2 1 0 0 0 0 Node 0, zone DMA32 5219 5526 4969 3455 1831 677 139 15 0 0 0 new: init Node 0, zone DMA 379 115 97 19 2 0 0 0 0 0 0 Node 0, zone DMA32 18891 16774 10862 3947 637 21 0 0 0 0 0 compaction Node 0, zone DMA 214 66 87 29 10 3 0 0 0 0 0 Node 0, zone DMA32 1612 3139 3154 2469 1745 990 384 94 7 0 0 As you can see, compaction made so many high-order pages. Yay! 
Link: http://lkml.kernel.org/r/1464736881-24886-13-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 9bc482d3460501ac809457af26b46b72cd7dc212) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I5d7f6eaa4c2d8d3f4da30fc2bd21f4db1be95e50 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index be2ee475be5c..38a6181be4b4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -738,7 +738,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, handle = zs_malloc(meta->mem_pool, clen, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | - __GFP_HIGHMEM); + __GFP_HIGHMEM | + __GFP_MOVABLE); if (!handle) { zcomp_stream_put(zram->comp); zstrm = NULL; @@ -746,7 +747,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic64_inc(&zram->stats.writestall); handle = zs_malloc(meta->mem_pool, clen, - GFP_NOIO | __GFP_HIGHMEM); + GFP_NOIO | __GFP_HIGHMEM | + __GFP_MOVABLE); if (handle) goto compress_again; From 199c3c6ef2e193e8c40c1a7323995a74630c4edf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:15 -0800 Subject: [PATCH 1152/1212] BACKPORT: mm: support anonymous stable page During development of zram-swap asynchronous writeback, I found strange corruption of compressed page, resulting in: Modules linked in: zram(E) CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G E 4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff88007620b840 task.stack: ffff880078090000 RIP: set_freeobj.part.43+0x1c/0x1f RSP: 0018:ffff880078093ca8 EFLAGS: 00010246 RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8
RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246 RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88 R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0 Call Trace: obj_malloc+0x22b/0x260 zs_malloc+0x1e4/0x580 zram_bvec_rw+0x4cd/0x830 [zram] page_requests_rw+0x9c/0x130 [zram] zram_thread+0xe6/0x173 [zram] kthread+0xca/0xe0 ret_from_fork+0x25/0x30 Investigation reveals that stable pages currently do not support anonymous pages. IOW, reuse_swap_page can reuse the page without waiting for writeback completion, so it can overwrite a page zram is compressing. Unfortunately, zram has used the per-cpu stream feature since v4.7. It aims to increase the cache hit ratio of the scratch buffer used for compressing. The downside of that approach is that zram has to ask for memory space for the compressed page in per-cpu context, which requires stricter gfp flags, so the allocation can fail. If so, it retries the allocation out of per-cpu context so it can get memory this time, compresses the data again, and copies it to the memory space. In this scenario, zram assumes the data never changes, but that is not true unless stable pages are supported. So, if the data is changed under us, zram can cause a buffer overrun, because the second compression size could be bigger than the one we got in the previous trial and zram blindly copies the bigger object to the smaller buffer. The overrun breaks zsmalloc free object chaining, so the system crashes as shown above. I think below is the same problem. https://bugzilla.suse.com/show_bug.cgi?id=997574 Unfortunately, reuse_swap_page should be atomic, so we cannot wait on writeback in there; the approach in this patch is to simply return false if we find it needs a stable page.
Although it increases the memory footprint temporarily, it happens rarely and the memory should be reclaimed easily even when it does happen. Also, it is better than waiting for IO completion, which is on the critical path for application latency. Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Cc: Sergey Senozhatsky Cc: Darrick J. Wong Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f05714293a591038304ddae7cb0dd747bb3786cc) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I0fa5012aff9daf614b2d1d04f35b86ff7043ff21 Signed-off-by: Amit Pundir --- include/linux/swap.h | 3 ++- mm/swapfile.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f1a52c11de0e..a3c65d09e6d9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -150,8 +150,9 @@ enum { SWP_FILE = (1 << 7), /* set after swap_activate success */ SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ /* add others here before...
*/ - SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/swapfile.c b/mm/swapfile.c index 623c77c1327b..9f7bd5e8e68a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -932,11 +932,25 @@ int reuse_swap_page(struct page *page) count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); - if (count == 1 && !PageWriteback(page)) { + if (count != 1) + goto out; + if (!PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); + } else { + swp_entry_t entry; + struct swap_info_struct *p; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p->flags & SWP_STABLE_WRITES) { + spin_unlock(&p->lock); + return false; + } + spin_unlock(&p->lock); } } +out: return count <= 1; } @@ -2481,6 +2495,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + + if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) + p->flags |= SWP_STABLE_WRITES; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; From ba6cca5f80fb306055ab79d094c146a7a6dfc316 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:18 -0800 Subject: [PATCH 1153/1212] UPSTREAM: zram: revalidate disk under init_lock Commit b4c5c60920e3 ("zram: avoid lockdep splat by revalidate_disk") moved revalidate_disk call out of init_lock to avoid lockdep false-positive splat. However, commit 08eee69fcf6b ("zram: remove init_lock in zram_make_request") removed init_lock in IO path so there is no worry about lockdep splat. So, let's restore it. This patch is needed to set BDI_CAP_STABLE_WRITES atomically in next patch. 
Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/1482366980-3782-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: Hugh Dickins Cc: Darrick J. Wong Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e7ccfc4ccb703e0f033bd4617580039898e912dd) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Iebb6f694e46a797f8ce34029857c01c0c71086c7 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 38a6181be4b4..ce56b48cb09d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1091,14 +1091,8 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. - */ revalidate_disk(zram->disk); + up_write(&zram->init_lock); return len; From 7032a283cab7c98f28ba3630d92ce24f435e0555 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:21 -0800 Subject: [PATCH 1154/1212] UPSTREAM: zram: support BDI_CAP_STABLE_WRITES zram has used per-cpu stream feature from v4.7. It aims for increasing cache hit ratio of scratch buffer for compressing. Downside of that approach is that zram should ask memory space for compressed page in per-cpu context which requires stricted gfp flag which could be failed. If so, it retries to allocate memory space out of per-cpu context so it could get memory this time and compress the data again, copies it to the memory space. 
In this scenario, zram assumes the data should never be changed but it is not true without stable page support. So, If the data is changed under us, zram can make buffer overrun so that zsmalloc free object chain is broken so system goes crash like below https://bugzilla.suse.com/show_bug.cgi?id=997574 This patch adds BDI_CAP_STABLE_WRITES to zram for declaring "I am block device needing *stable write*". Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/1482366980-3782-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: Hugh Dickins Cc: Darrick J. Wong Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit b09ab054b69b07077bd3292f67e777861ac796e5) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I3134a5882c1939792ffa71b8f31f7ab642a0e9a3 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ce56b48cb09d..3ccab7727e04 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +112,14 @@ static inline bool is_partial_io(struct bio_vec *bvec) return bvec->bv_len != PAGE_SIZE; } +static void zram_revalidate_disk(struct zram *zram) +{ + revalidate_disk(zram->disk); + /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ + zram->disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; +} + /* * Check if request is within bounds and aligned on zram logical blocks. 
*/ @@ -1091,7 +1100,7 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); up_write(&zram->init_lock); return len; @@ -1139,7 +1148,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - revalidate_disk(zram->disk); + zram_revalidate_disk(zram); bdput(bdev); mutex_lock(&bdev->bd_mutex); From ff26f4e4a0ce0438d7bd20570c327bbb47a8605b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 22 Feb 2017 15:46:45 -0800 Subject: [PATCH 1155/1212] UPSTREAM: zram: remove obsolete sysfs attrs We had a deprecated_attr_warn() warning for 2 years and now the time has come and we finally can do the cleanup. The plan was as follows: : per-stat sysfs attributes are considered to be deprecated. : The basic strategy is: : -- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) : -- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) : : The list of deprecated attributes can be found here: : Documentation/ABI/obsolete/sysfs-block-zram : : Basically, every attribute that has its own read accessible sysfs : node (e.g. num_reads) *AND* is accessible via one of the stat files : (zram/stat or zram/io_stat or zram/mm_stat) is considered : to be deprecated. The patch also removes `obsolete/sysfs-block-zram', clean ups `testing/sysfs-block-zram' and tweaks zram.txt files. 
Link: http://lkml.kernel.org/r/20170118035838.11090-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c87d1655c29500b459fb135258a93f8309ada9c7) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Idd86259230d6a4bf0feeee53b5b69f3f3df774d4 Signed-off-by: Amit Pundir --- Documentation/ABI/obsolete/sysfs-block-zram | 119 -------------------- Documentation/ABI/testing/sysfs-block-zram | 101 ++--------------- Documentation/blockdev/zram.txt | 74 ++++++------ drivers/block/zram/zram_drv.c | 101 +---------------- 4 files changed, 42 insertions(+), 353 deletions(-) delete mode 100644 Documentation/ABI/obsolete/sysfs-block-zram diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram deleted file mode 100644 index 720ea92cfb2e..000000000000 --- a/Documentation/ABI/obsolete/sysfs-block-zram +++ /dev/null @@ -1,119 +0,0 @@ -What: /sys/block/zram/num_reads -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. - Now accessible via zram/stat node. - -What: /sys/block/zram/num_writes -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - Now accessible via zram/stat node. - -What: /sys/block/zram/invalid_io -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/failed_reads -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. 
- Now accessible via zram/io_stat node. - -What: /sys/block/zram/failed_writes -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/notify_free -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The notify_free file is read-only. Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/zero_pages -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/orig_data_size -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/compr_data_size -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. - Unit: bytes - Now accessible via zram/mm_stat node. 
- -What: /sys/block/zram/mem_used_total -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/mem_used_max -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. - Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram/mm_stat - node. - -What: /sys/block/zram/mem_limit -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. - The limit could be changed in run time and "0" means disable - the limit. No limit is the initial state. Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram/mm_stat - node. diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 4518d30b8c2e..451b6d882b2c 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -22,41 +22,6 @@ Description: device. The reset operation frees all the memory associated with this device. -What: /sys/block/zram/num_reads -Date: August 2010 -Contact: Nitin Gupta -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. 
- -What: /sys/block/zram/num_writes -Date: August 2010 -Contact: Nitin Gupta -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - -What: /sys/block/zram/invalid_io -Date: August 2010 -Contact: Nitin Gupta -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - -What: /sys/block/zram/failed_reads -Date: February 2014 -Contact: Sergey Senozhatsky -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. - -What: /sys/block/zram/failed_writes -Date: February 2014 -Contact: Sergey Senozhatsky -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - What: /sys/block/zram/max_comp_streams Date: February 2014 Contact: Sergey Senozhatsky @@ -73,74 +38,24 @@ Description: available and selected compression algorithms, change compression algorithm selection. -What: /sys/block/zram/notify_free -Date: August 2010 -Contact: Nitin Gupta -Description: - The notify_free file is read-only. Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - -What: /sys/block/zram/zero_pages -Date: August 2010 -Contact: Nitin Gupta -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. 
- -What: /sys/block/zram/orig_data_size -Date: August 2010 -Contact: Nitin Gupta -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - -What: /sys/block/zram/compr_data_size -Date: August 2010 -Contact: Nitin Gupta -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. - Unit: bytes - -What: /sys/block/zram/mem_used_total -Date: August 2010 -Contact: Nitin Gupta -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - What: /sys/block/zram/mem_used_max Date: August 2014 Contact: Minchan Kim Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. + The mem_used_max file is write-only and is used to reset + the counter of maximum memory zram have consumed to store + compressed data. For resetting the value, you should write + "0". Otherwise, you could see -EINVAL. Unit: bytes What: /sys/block/zram/mem_limit Date: August 2014 Contact: Minchan Kim Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. The - limit could be changed in run time and "0" means disable the - limit. No limit is the initial state. Unit: bytes + The mem_limit file is write-only and specifies the maximum + amount of memory ZRAM can use to store the compressed data. 
+ The limit could be changed in run time and "0" means disable + the limit. No limit is the initial state. Unit: bytes What: /sys/block/zram/compact Date: August 2015 diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 0535ae1f73e5..1c0c08d9206b 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -161,42 +161,14 @@ Name access description disksize RW show and set the device's disk size initstate RO shows the initialization state of the device reset WO trigger device reset -num_reads RO the number of reads -failed_reads RO the number of failed reads -num_write RO the number of writes -failed_writes RO the number of failed writes -invalid_io RO the number of non-page-size-aligned I/O requests +mem_used_max WO reset the `mem_used_max' counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can use + to store the compressed data max_comp_streams RW the number of possible concurrent compress operations comp_algorithm RW show and change the compression algorithm -notify_free RO the number of notifications to free pages (either - slot free notifications or REQ_DISCARD requests) -zero_pages RO the number of zero filled pages written to this disk -orig_data_size RO uncompressed size of data stored in this disk -compr_data_size RO compressed size of data stored in this disk -mem_used_total RO the amount of memory allocated for this disk -mem_used_max RW the maximum amount of memory zram have consumed to - store the data (to reset this counter to the actual - current value, write 1 to this attribute) -mem_limit RW the maximum amount of memory ZRAM can use to store - the compressed data -pages_compacted RO the number of pages freed during compaction - (available only via zram/mm_stat node) compact WO trigger memory compaction debug_stat RO this file is used for zram debugging purposes -WARNING -======= -per-stat sysfs attributes are considered to be deprecated. 
-The basic strategy is: --- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) --- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) - -The list of deprecated attributes can be found here: -Documentation/ABI/obsolete/sysfs-block-zram - -Basically, every attribute that has its own read accessible sysfs node -(e.g. num_reads) *AND* is accessible via one of the stat files (zram/stat -or zram/io_stat or zram/mm_stat) is considered to be deprecated. User space is advised to use the following files to read the device statistics. @@ -211,22 +183,40 @@ The stat file represents device's I/O statistics not accounted by block layer and, thus, not available in zram/stat file. It consists of a single line of text and contains the following stats separated by whitespace: - failed_reads - failed_writes - invalid_io - notify_free + failed_reads the number of failed reads + failed_writes the number of failed writes + invalid_io the number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + a) the number of pages freed because of swap slot free + notifications or b) the number of pages freed because of + REQ_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. File /sys/block/zram/mm_stat The stat file represents device's mm statistics. It consists of a single line of text and contains the following stats separated by whitespace: - orig_data_size - compr_data_size - mem_used_total - mem_limit - mem_used_max - zero_pages - num_migrated + orig_data_size uncompressed size of data stored in this disk. + This excludes zero-filled pages (zero_pages) since no + memory is allocated for them. 
+ Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + zero_pages the number of zero filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction 9) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3ccab7727e04..022cda34ef34 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,27 +44,6 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; -static inline void deprecated_attr_warn(const char *name) -{ - pr_warn_once("%d (%s) Attribute %s (and others) will be removed. 
%s\n", - task_pid_nr(current), - current->comm, - name, - "See zram documentation."); -} - -#define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ - struct device_attribute *attr, char *b) \ -{ \ - struct zram *zram = dev_to_zram(d); \ - \ - deprecated_attr_warn(__stringify(name)); \ - return scnprintf(b, PAGE_SIZE, "%llu\n", \ - (u64)atomic64_read(&zram->stats.name)); \ -} \ -static DEVICE_ATTR_RO(name); - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -217,47 +196,6 @@ static ssize_t disksize_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("orig_data_size"); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_total"); - down_read(&zram->init_lock); - if (init_done(zram)) { - struct zram_meta *meta = zram->meta; - val = zs_get_total_pages(meta->mem_pool); - } - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - -static ssize_t mem_limit_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_limit"); - down_read(&zram->init_lock); - val = zram->limit_pages; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -276,21 +214,6 @@ static ssize_t mem_limit_store(struct device *dev, return len; } -static ssize_t mem_used_max_show(struct device *dev, - struct 
device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_max"); - down_read(&zram->init_lock); - if (init_done(zram)) - val = atomic_long_read(&zram->stats.max_used_pages); - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_used_max_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -466,14 +389,6 @@ static ssize_t debug_stat_show(struct device *dev, static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); static inline bool zram_meta_get(struct zram *zram) { @@ -1184,10 +1099,8 @@ static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_WO(mem_limit); +static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); @@ -1195,17 +1108,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - &dev_attr_num_writes.attr, - &dev_attr_failed_reads.attr, - &dev_attr_failed_writes.attr, &dev_attr_compact.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, &dev_attr_mem_limit.attr, &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, From ea830271ca69bb8b2e5876e3d9fed4e2eb5ac274 Mon Sep 17 00:00:00 
2001 From: Minchan Kim Date: Fri, 24 Feb 2017 14:56:47 -0800 Subject: [PATCH 1156/1212] BACKPORT: zram: remove waitqueue for IO done zram_reset_device() waits for ongoing writepage pages to be completed by zram->refcount logic. However, it's pointless because before the reset, we prevent further opening of zram by zram->claim and flush all of pending IO by fsync_bdev so there should be no pending IO at the zram_reset_device(). So let's remove that code which is even broken due to the lack of wake_up elsewhere. Link: http://lkml.kernel.org/r/1485145031-11661-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit a09759acaacf6cf738e1bc6c66d41485c87fd371) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I97170fb576be7baae63f82334af0dd5e91b16763 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 40 +++-------------------------------- drivers/block/zram/zram_drv.h | 3 --- 2 files changed, 3 insertions(+), 40 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 022cda34ef34..c4cd527bc66c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -390,18 +390,6 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -static inline bool zram_meta_get(struct zram *zram) -{ - if (atomic_inc_not_zero(&zram->refcount)) - return true; - return false; -} - -static inline void zram_meta_put(struct zram *zram) -{ - atomic_dec(&zram->refcount); -} - static void zram_meta_free(struct zram_meta *meta, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -855,22 +843,17 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; - if (unlikely(!zram_meta_get(zram))) - goto error; - blk_queue_split(queue, &bio, queue->bio_split); if (!valid_io_request(zram, 
bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); - goto put_zram; + goto error; } __zram_make_request(zram, bio); - zram_meta_put(zram); return BLK_QC_T_NONE; -put_zram: - zram_meta_put(zram); + error: bio_io_error(bio); return BLK_QC_T_NONE; @@ -900,13 +883,11 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, struct bio_vec bv; zram = bdev->bd_disk->private_data; - if (unlikely(!zram_meta_get(zram))) - goto out; if (!valid_io_request(zram, sector, PAGE_SIZE)) { atomic64_inc(&zram->stats.invalid_io); err = -EINVAL; - goto put_zram; + goto out; } index = sector >> SECTORS_PER_PAGE_SHIFT; @@ -917,8 +898,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_offset = 0; err = zram_bvec_rw(zram, &bv, index, offset, rw); -put_zram: - zram_meta_put(zram); out: /* * If I/O fails, just return error(ie, non-zero) without @@ -951,17 +930,6 @@ static void zram_reset_device(struct zram *zram) meta = zram->meta; comp = zram->comp; disksize = zram->disksize; - /* - * Refcount will go down to 0 eventually and r/w handler - * cannot handle further I/O so it will bail out by - * check zram_meta_get. - */ - zram_meta_put(zram); - /* - * We want to free zram_meta in process context to avoid - * deadlock between reclaim path and any other locks. 
- */ - wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); /* Reset stats */ memset(&zram->stats, 0, sizeof(zram->stats)); @@ -1009,8 +977,6 @@ static ssize_t disksize_store(struct device *dev, goto out_destroy_comp; } - init_waitqueue_head(&zram->io_done); - atomic_set(&zram->refcount, 1); zram->meta = meta; zram->comp = comp; zram->disksize = disksize; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 74fcf10da374..2692554b7737 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -106,9 +106,6 @@ struct zram { unsigned long limit_pages; struct zram_stats stats; - atomic_t refcount; /* refcount for zram_meta */ - /* wait all IO under all of cpu are done */ - wait_queue_head_t io_done; /* * This is the limit on amount of *uncompressed* worth of data * we can store in a disk. From 7188698948edc229ba9c0b17082f753a575a2148 Mon Sep 17 00:00:00 2001 From: zhouxianrong Date: Fri, 24 Feb 2017 14:59:27 -0800 Subject: [PATCH 1157/1212] BACKPORT: zram: extend zero pages to same element pages The idea is that without doing more calculations we extend zero pages to same element pages for zram. zero page is special case of same element page with zero element. 1. the test is done under android 7.0 2. startup too many applications circularly 3. sample the zero pages, same pages (none-zero element) and total pages in function page_zero_filled the result is listed as below: ZERO SAME TOTAL 36214 17842 598196 ZERO/TOTAL SAME/TOTAL (ZERO+SAME)/TOTAL ZERO/SAME AVERAGE 0.060631909 0.024990816 0.085622726 2.663825038 STDEV 0.00674612 0.005887625 0.009707034 2.115881328 MAX 0.069698422 0.030046087 0.094975336 7.56043956 MIN 0.03959586 0.007332205 0.056055193 1.928985507 from the above data, the benefit is about 2.5% and up to 3% of total swapout pages. The defect of the patch is that when we recovery a page from non-zero element the operations are low efficient for partial read. 
This patch extends zero_page to same_page so if there is any user to have monitored zero_pages, he will be surprised if the number is increased but it's not harmful, I believe. [minchan@kernel.org: do not free same element pages in zram_meta_free] Link: http://lkml.kernel.org/r/20170207065741.GA2567@bbox Link: http://lkml.kernel.org/r/1483692145-75357-1-git-send-email-zhouxianrong@huawei.com Link: http://lkml.kernel.org/r/1486307804-27903-1-git-send-email-minchan@kernel.org Signed-off-by: zhouxianrong Signed-off-by: Minchan Kim Cc: Sergey Senozhatsky Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 8e19d540d107ee897eb9a874844060c94e2376c0) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I92ebb07a6ad96be82443d6e0c0d4f25cbe936915 Signed-off-by: Amit Pundir --- Documentation/blockdev/zram.txt | 6 +-- drivers/block/zram/zram_drv.c | 87 +++++++++++++++++++++++---------- drivers/block/zram/zram_drv.h | 9 ++-- 3 files changed, 69 insertions(+), 33 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 1c0c08d9206b..4fced8a21307 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -201,8 +201,8 @@ File /sys/block/zram/mm_stat The stat file represents device's mm statistics. It consists of a single line of text and contains the following stats separated by whitespace: orig_data_size uncompressed size of data stored in this disk. - This excludes zero-filled pages (zero_pages) since no - memory is allocated for them. + This excludes same-element-filled pages (same_pages) since + no memory is allocated for them. Unit: bytes compr_data_size compressed size of data stored in this disk mem_used_total the amount of memory allocated for this disk. 
This @@ -214,7 +214,7 @@ line of text and contains the following stats separated by whitespace: the compressed data mem_used_max the maximum amount of memory zram have consumed to store the data - zero_pages the number of zero filled pages written to this disk. + same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c4cd527bc66c..28a0e5169756 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -73,6 +73,17 @@ static void zram_clear_flag(struct zram_meta *meta, u32 index, meta->table[index].value &= ~BIT(flag); } +static inline void zram_set_element(struct zram_meta *meta, u32 index, + unsigned long element) +{ + meta->table[index].element = element; +} + +static inline void zram_clear_element(struct zram_meta *meta, u32 index) +{ + meta->table[index].element = 0; +} + static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) { return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); @@ -145,31 +156,46 @@ static inline void update_used_max(struct zram *zram, } while (old_max != cur_max); } -static bool page_zero_filled(void *ptr) +static inline void zram_fill_page(char *ptr, unsigned long len, + unsigned long value) +{ + int i; + unsigned long *page = (unsigned long *)ptr; + + WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long))); + + if (likely(value == 0)) { + memset(ptr, 0, len); + } else { + for (i = 0; i < len / sizeof(*page); i++) + page[i] = value; + } +} + +static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; page = (unsigned long *)ptr; - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) + for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { + if (page[pos] != page[pos + 1]) return false; } + *element = page[pos]; + return true; } 
-static void handle_zero_page(struct bio_vec *bvec) +static void handle_same_page(struct bio_vec *bvec, unsigned long element) { struct page *page = bvec->bv_page; void *user_mem; user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); + zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element); kunmap_atomic(user_mem); flush_dcache_page(page); @@ -362,7 +388,7 @@ static ssize_t mm_stat_show(struct device *dev, mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.zero_pages), + (u64)atomic64_read(&zram->stats.same_pages), pool_stats.pages_compacted); up_read(&zram->init_lock); @@ -398,8 +424,11 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize) /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) { unsigned long handle = meta->table[index].handle; - - if (!handle) + /* + * No memory is allocated for same element filled pages. + * Simply clear same page flag. + */ + if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) continue; zs_free(meta->mem_pool, handle); @@ -449,18 +478,20 @@ static void zram_free_page(struct zram *zram, size_t index) struct zram_meta *meta = zram->meta; unsigned long handle = meta->table[index].handle; - if (unlikely(!handle)) { - /* - * No memory is allocated for zero filled pages. - * Simply clear zero page flag. - */ - if (zram_test_flag(meta, index, ZRAM_ZERO)) { - zram_clear_flag(meta, index, ZRAM_ZERO); - atomic64_dec(&zram->stats.zero_pages); - } + /* + * No memory is allocated for same element filled pages. + * Simply clear same page flag. 
+ */ + if (zram_test_flag(meta, index, ZRAM_SAME)) { + zram_clear_flag(meta, index, ZRAM_SAME); + zram_clear_element(meta, index); + atomic64_dec(&zram->stats.same_pages); return; } + if (!handle) + return; + zs_free(meta->mem_pool, handle); atomic64_sub(zram_get_obj_size(meta, index), @@ -483,9 +514,9 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); - if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) { bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - memset(mem, 0, PAGE_SIZE); + zram_fill_page(mem, PAGE_SIZE, meta->table[index].element); return 0; } @@ -521,9 +552,9 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_ZERO)) { + zram_test_flag(meta, index, ZRAM_SAME)) { bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - handle_zero_page(bvec); + handle_same_page(bvec, meta->table[index].element); return 0; } bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); @@ -571,6 +602,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct zram_meta *meta = zram->meta; struct zcomp_strm *zstrm = NULL; unsigned long alloced_pages; + unsigned long element; page = bvec->bv_page; if (is_partial_io(bvec)) { @@ -599,16 +631,17 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, uncmem = user_mem; } - if (page_zero_filled(uncmem)) { + if (page_same_filled(uncmem, &element)) { if (user_mem) kunmap_atomic(user_mem); /* Free memory associated with this sector now. 
*/ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_ZERO); + zram_set_flag(meta, index, ZRAM_SAME); + zram_set_element(meta, index, element); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - atomic64_inc(&zram->stats.zero_pages); + atomic64_inc(&zram->stats.same_pages); ret = 0; goto out; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 2692554b7737..caeff51f1571 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -61,7 +61,7 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_SAME = ZRAM_FLAG_SHIFT, ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, @@ -71,7 +71,10 @@ enum zram_pageflags { /* Allocated for each disk page */ struct zram_table_entry { - unsigned long handle; + union { + unsigned long handle; + unsigned long element; + }; unsigned long value; }; @@ -83,7 +86,7 @@ struct zram_stats { atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t same_pages; /* no. of same element filled pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ From b200c92fd30858359ddb494e14f6140c4d85c660 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 13 Apr 2017 14:56:35 -0700 Subject: [PATCH 1158/1212] UPSTREAM: zram: fix operator precedence to get offset In zram_rw_page, the logic to get offset is wrong by operator precedence (i.e., "<<" is higher than "&"). With wrong offset, zram can corrupt the user's data. This patch fixes it. 
Fixes: 8c7f01025 ("zram: implement rw_page operation of zram") Link: http://lkml.kernel.org/r/1492042622-12074-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 4ca82dabc9fbf7bc5322aa54d802cb3cb7b125c5) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I6abb2aef381463976aea1fa8e7f5ca07367190e9 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 28a0e5169756..b064c0b3c4c5 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -924,7 +924,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, } index = sector >> SECTORS_PER_PAGE_SHIFT; - offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; bv.bv_page = page; bv.bv_len = PAGE_SIZE; From 49ab60b20fbef11ed78ca5dd7b22696f33bc4f6d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:55:38 -0700 Subject: [PATCH 1159/1212] BACKPORT: zram: handle multiple pages attached bio's bvec Patch series "zram clean up", v2. This patchset aims to clean up zram. [1] cleans up the handling of bvecs that span multiple pages. [2] cleans up partial IO handling. [3-6] clean up zram by using accessors and removing a pointless structure. With [2-6] applied, we save a few hundred bytes and also get a big readability improvement. 
x86: 708 byte save add/remove: 1/1 grow/shrink: 0/11 up/down: 478/-1186 (-708) function old new delta zram_special_page_read - 478 +478 zram_reset_device 317 314 -3 mem_used_max_store 131 128 -3 compact_store 96 93 -3 mm_stat_show 203 197 -6 zram_add 719 712 -7 zram_slot_free_notify 229 214 -15 zram_make_request 819 803 -16 zram_meta_free 128 111 -17 zram_free_page 180 151 -29 disksize_store 432 361 -71 zram_decompress_page.isra 504 - -504 zram_bvec_rw 2592 2080 -512 Total: Before=25350773, After=25350065, chg -0.00% ppc64: 231 byte save add/remove: 2/0 grow/shrink: 1/9 up/down: 681/-912 (-231) function old new delta zram_special_page_read - 480 +480 zram_slot_lock - 200 +200 vermagic 39 40 +1 mm_stat_show 256 248 -8 zram_meta_free 200 184 -16 zram_add 944 912 -32 zram_free_page 348 308 -40 disksize_store 572 492 -80 zram_decompress_page 664 564 -100 zram_slot_free_notify 292 160 -132 zram_make_request 1132 1000 -132 zram_bvec_rw 2768 2396 -372 Total: Before=17565825, After=17565594, chg -0.00% This patch (of 6): Johannes Thumshirn reported that the system panics when using an NVMe over Fabrics loopback target with zram. The reason is that zram expects each bvec in a bio to contain a single page, but nvme can attach a large number of pages to a bio's bvec, so zram's index arithmetic can be wrong and the resulting out-of-bounds access makes the system panic. [1] in mainline solved the problem by limiting max_sectors to SECTORS_PER_PAGE, but that makes zram slow because the bio has to be split for each page, so this patch makes zram aware of multiple pages in a bvec, which solves the problem without any regression (i.e., bio split). 
[1] 0bc315381fe9, zram: set physical queue limits to avoid array out of bounds accesses Link: http://lkml.kernel.org/r/20170413134057.GA27499@bbox Signed-off-by: Minchan Kim Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Reviewed-by: Sergey Senozhatsky Cc: Hannes Reinecke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e86942c7b6c1e1dd5e539f3bf3cfb63799163048) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ibedc9e8163fc16a0c2569e8c3e33dd81bb325ee5 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 36 ++++++++++------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b064c0b3c4c5..90d9c29ebb74 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -136,8 +136,7 @@ static inline bool valid_io_request(struct zram *zram, static void update_position(u32 *index, int *offset, struct bio_vec *bvec) { - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; + *index += (*offset + bvec->bv_len) / PAGE_SIZE; *offset = (*offset + bvec->bv_len) % PAGE_SIZE; } @@ -835,31 +834,20 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { - int max_transfer_size = PAGE_SIZE - offset; - - if (bvec.bv_len > max_transfer_size) { - /* - * zram_bvec_rw() can only make operation on a single - * zram page. Split the bio vector. 
- */ - struct bio_vec bv; - - bv.bv_page = bvec.bv_page; - bv.bv_len = max_transfer_size; - bv.bv_offset = bvec.bv_offset; + struct bio_vec bv = bvec; + unsigned int unwritten = bvec.bv_len; + do { + bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, + unwritten); if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) goto out; - bv.bv_len = bvec.bv_len - max_transfer_size; - bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) - goto out; - } else - if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) - goto out; + bv.bv_offset += bv.bv_len; + unwritten -= bv.bv_len; - update_position(&index, &offset, &bvec); + update_position(&index, &offset, &bv); + } while (unwritten); } bio_endio(bio); @@ -876,8 +864,6 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; - blk_queue_split(queue, &bio, queue->bio_split); - if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); @@ -1185,8 +1171,6 @@ static int zram_add(void) blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); zram->disk->queue->limits.discard_granularity = PAGE_SIZE; - zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE; - zram->disk->queue->limits.chunk_sectors = 0; blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); /* * zram_bio_discard() will clear all logical blocks if logical block From ca119b7f8f3d49a892453cc571faa7601e5eec28 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:55:41 -0700 Subject: [PATCH 1160/1212] BACKPORT: zram: partial IO refactoring For architectures with PAGE_SIZE > 4K, zram has supported partial IO. However, the code that mixes normal and partial IO handling is too messy, which makes the IO handler functions error-prone to modify for upcoming features, so this patch aims to clean up zram's IO handling functions. 
Link: http://lkml.kernel.org/r/1492052365-16169-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 1f7319c7427503abe2d365683588827b80f5714e) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I0f023d646d17f8156130cd0507b65f2223768adf Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 359 ++++++++++++++++++---------------- 1 file changed, 195 insertions(+), 164 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 90d9c29ebb74..e480eaff1806 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,6 +44,8 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static void zram_free_page(struct zram *zram, size_t index); + static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -97,10 +99,17 @@ static void zram_set_obj_size(struct zram_meta *meta, meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } +#if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { return bvec->bv_len != PAGE_SIZE; } +#else +static inline bool is_partial_io(struct bio_vec *bvec) +{ + return false; +} +#endif static void zram_revalidate_disk(struct zram *zram) { @@ -188,18 +197,6 @@ static bool page_same_filled(void *ptr, unsigned long *element) return true; } -static void handle_same_page(struct bio_vec *bvec, unsigned long element) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -415,6 +412,53 @@ static DEVICE_ATTR_RO(io_stat); static 
DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); +static bool zram_same_page_read(struct zram *zram, u32 index, + struct page *page, + unsigned int offset, unsigned int len) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + if (unlikely(!meta->table[index].handle) || + zram_test_flag(meta, index, ZRAM_SAME)) { + void *mem; + + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + mem = kmap_atomic(page); + zram_fill_page(mem + offset, len, meta->table[index].element); + kunmap_atomic(mem); + return true; + } + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + return false; +} + +static bool zram_same_page_write(struct zram *zram, u32 index, + struct page *page) +{ + unsigned long element; + void *mem = kmap_atomic(page); + + if (page_same_filled(mem, &element)) { + struct zram_meta *meta = zram->meta; + + kunmap_atomic(mem); + /* Free memory associated with this sector now. */ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_SAME); + zram_set_element(meta, index, element); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + atomic64_inc(&zram->stats.same_pages); + return true; + } + kunmap_atomic(mem); + + return false; +} + static void zram_meta_free(struct zram_meta *meta, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -501,169 +545,103 @@ static void zram_free_page(struct zram *zram, size_t index) zram_set_obj_size(meta, index, 0); } -static int zram_decompress_page(struct zram *zram, char *mem, u32 index) +static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) { - int ret = 0; - unsigned char *cmem; - struct zram_meta *meta = zram->meta; + int ret; unsigned long handle; unsigned int size; + void *src, *dst; + struct zram_meta *meta = zram->meta; + + if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) + return 0; bit_spin_lock(ZRAM_ACCESS, 
&meta->table[index].value); handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - zram_fill_page(mem, PAGE_SIZE, meta->table[index].element); - return 0; - } - - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + src = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { - memcpy(mem, cmem, PAGE_SIZE); + dst = kmap_atomic(page); + memcpy(dst, src, PAGE_SIZE); + kunmap_atomic(dst); + ret = 0; } else { struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_decompress(zstrm, cmem, size, mem); + dst = kmap_atomic(page); + ret = zcomp_decompress(zstrm, src, size, dst); + kunmap_atomic(dst); zcomp_stream_put(zram->comp); } zs_unmap_object(meta->mem_pool, handle); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret)) { - pr_err("Decompression failed! 
err=%d, page=%u\n", ret, index); - return ret; - } - - return 0; -} - -static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset) -{ - int ret; - struct page *page; - unsigned char *user_mem, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - page = bvec->bv_page; - - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - handle_same_page(bvec, meta->table[index].element); - return 0; - } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - - if (is_partial_io(bvec)) - /* Use a temporary buffer to decompress the page */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - - user_mem = kmap_atomic(page); - if (!is_partial_io(bvec)) - uncmem = user_mem; - - if (!uncmem) { - pr_err("Unable to allocate temp memory\n"); - ret = -ENOMEM; - goto out_cleanup; - } - - ret = zram_decompress_page(zram, uncmem, index); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) - goto out_cleanup; + pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - if (is_partial_io(bvec)) - memcpy(user_mem + bvec->bv_offset, uncmem + offset, - bvec->bv_len); - - flush_dcache_page(page); - ret = 0; -out_cleanup: - kunmap_atomic(user_mem); - if (is_partial_io(bvec)) - kfree(uncmem); return ret; } -static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset) +static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) { - int ret = 0; - unsigned int clen; - unsigned long handle = 0; + int ret; struct page *page; - unsigned char *user_mem, *cmem, *src, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - struct zcomp_strm *zstrm = NULL; - unsigned long alloced_pages; - unsigned long element; page = bvec->bv_page; if (is_partial_io(bvec)) { - /* - * This is a partial IO. 
We need to read the full page - * before to write the changes. - */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - if (!uncmem) { - ret = -ENOMEM; - goto out; - } - ret = zram_decompress_page(zram, uncmem, index); - if (ret) - goto out; + /* Use a temporary buffer to decompress the page */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; } + ret = zram_decompress_page(zram, page, index); + if (unlikely(ret)) + goto out; + + if (is_partial_io(bvec)) { + void *dst = kmap_atomic(bvec->bv_page); + void *src = kmap_atomic(page); + + memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len); + kunmap_atomic(src); + kunmap_atomic(dst); + } +out: + if (is_partial_io(bvec)) + __free_page(page); + + return ret; +} + +static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, + struct page *page, + unsigned long *out_handle, unsigned int *out_comp_len) +{ + int ret; + unsigned int comp_len; + void *src; + unsigned long alloced_pages; + unsigned long handle = 0; + struct zram_meta *meta = zram->meta; + compress_again: - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) { - memcpy(uncmem + offset, user_mem + bvec->bv_offset, - bvec->bv_len); - kunmap_atomic(user_mem); - user_mem = NULL; - } else { - uncmem = user_mem; - } - - if (page_same_filled(uncmem, &element)) { - if (user_mem) - kunmap_atomic(user_mem); - /* Free memory associated with this sector now. 
*/ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_SAME); - zram_set_element(meta, index, element); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - - atomic64_inc(&zram->stats.same_pages); - ret = 0; - goto out; - } - - zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_compress(zstrm, uncmem, &clen); - if (!is_partial_io(bvec)) { - kunmap_atomic(user_mem); - user_mem = NULL; - uncmem = NULL; - } + src = kmap_atomic(page); + ret = zcomp_compress(*zstrm, src, &comp_len); + kunmap_atomic(src); if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); - goto out; + if (handle) + zs_free(meta->mem_pool, handle); + return ret; } - src = zstrm->buffer; - if (unlikely(clen > max_zpage_size)) { - clen = PAGE_SIZE; - if (is_partial_io(bvec)) - src = uncmem; - } + if (unlikely(comp_len > max_zpage_size)) + comp_len = PAGE_SIZE; /* * handle allocation has 2 paths: @@ -679,27 +657,21 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, * from the slow path and handle has already been allocated. 
*/ if (!handle) - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(meta->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE); if (!handle) { zcomp_stream_put(zram->comp); - zstrm = NULL; - atomic64_inc(&zram->stats.writestall); - - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(meta->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + *zstrm = zcomp_stream_get(zram->comp); if (handle) goto compress_again; - - pr_err("Error allocating memory for compressed page: %u, size=%u\n", - index, clen); - ret = -ENOMEM; - goto out; + return -ENOMEM; } alloced_pages = zs_get_total_pages(meta->mem_pool); @@ -707,22 +679,45 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (zram->limit_pages && alloced_pages > zram->limit_pages) { zs_free(meta->mem_pool, handle); - ret = -ENOMEM; - goto out; + return -ENOMEM; } - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + *out_handle = handle; + *out_comp_len = comp_len; + return 0; +} - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) +{ + int ret; + unsigned long handle; + unsigned int comp_len; + void *src, *dst; + struct zcomp_strm *zstrm; + struct zram_meta *meta = zram->meta; + struct page *page = bvec->bv_page; + + if (zram_same_page_write(zram, index, page)) + return 0; + + zstrm = zcomp_stream_get(zram->comp); + ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); + if (ret) { + zcomp_stream_put(zram->comp); + return ret; + } + + + dst = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + + src = zstrm->buffer; + if (comp_len == PAGE_SIZE) src = kmap_atomic(page); - memcpy(cmem, src, PAGE_SIZE); + memcpy(dst, src, comp_len); + if (comp_len == PAGE_SIZE) kunmap_atomic(src); - } else { - memcpy(cmem, src, clen); - } zcomp_stream_put(zram->comp); - zstrm = NULL; zs_unmap_object(meta->mem_pool, handle); /* @@ -731,19 +726,54 
@@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, */ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - meta->table[index].handle = handle; - zram_set_obj_size(meta, index, clen); + zram_set_obj_size(meta, index, comp_len); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_add(comp_len, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); + return 0; +} + +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + struct page *page = NULL; + void *src; + struct bio_vec vec; + + vec = *bvec; + if (is_partial_io(bvec)) { + void *dst; + /* + * This is a partial IO. We need to read the full page + * before to write the changes. + */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; + + ret = zram_decompress_page(zram, page, index); + if (ret) + goto out; + + src = kmap_atomic(bvec->bv_page); + dst = kmap_atomic(page); + memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len); + kunmap_atomic(dst); + kunmap_atomic(src); + + vec.bv_page = page; + vec.bv_len = PAGE_SIZE; + vec.bv_offset = 0; + } + + ret = __zram_bvec_write(zram, &vec, index); out: - if (zstrm) - zcomp_stream_put(zram->comp); if (is_partial_io(bvec)) - kfree(uncmem); + __free_page(page); return ret; } @@ -798,6 +828,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (rw == READ) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset); + flush_dcache_page(bvec->bv_page); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); From 7fa2524c0993b5e7b987cd3f1d8ff96a43abde56 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:55:44 -0700 Subject: [PATCH 1161/1212] UPSTREAM: zram: use zram_slot_lock instead of raw bit_spin_lock op With this 
clean-up phase, I want to use zram's wrapper function to lock table access which is more consistent with other zram's functions. Link: http://lkml.kernel.org/r/1492052365-16169-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 86c49814d449ebc51c7d455ac8e3d17b9fa702eb) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I6afee89dce63dff6d759c78e25926814fc016107 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 41 +++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e480eaff1806..a883c9876345 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -412,24 +412,38 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); +static void zram_slot_lock(struct zram *zram, u32 index) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); +} + static bool zram_same_page_read(struct zram *zram, u32 index, struct page *page, unsigned int offset, unsigned int len) { struct zram_meta *meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_SAME)) { void *mem; - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); mem = kmap_atomic(page); zram_fill_page(mem + offset, len, meta->table[index].element); kunmap_atomic(mem); return true; } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); return false; 
} @@ -445,11 +459,11 @@ static bool zram_same_page_write(struct zram *zram, u32 index, kunmap_atomic(mem); /* Free memory associated with this sector now. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_SAME); zram_set_element(meta, index, element); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); return true; @@ -556,7 +570,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); @@ -575,7 +589,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zram->comp); } zs_unmap_object(meta->mem_pool, handle); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) @@ -724,11 +738,11 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) * Free memory associated with this sector * before overwriting unused sectors. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); meta->table[index].handle = handle; zram_set_obj_size(meta, index, comp_len); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); /* Update stats */ atomic64_add(comp_len, &zram->stats.compr_data_size); @@ -786,7 +800,6 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_iter.bi_size; - struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. 
Because logical block @@ -807,9 +820,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; @@ -918,9 +931,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); } From bb63c75ee7200e4a951c17ed014976f1a77c802f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:55:47 -0700 Subject: [PATCH 1162/1212] UPSTREAM: zram: remove zram_meta structure It's redundant now. Instead, remove it and use zram structure directly. 
Link: http://lkml.kernel.org/r/1492052365-16169-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit beb6602cf87abee547b2692031185111f625153a) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I720a282710b97fd75c156305fd505d4497b89e4c Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 191 ++++++++++++++-------------------- drivers/block/zram/zram_drv.h | 6 +- 2 files changed, 79 insertions(+), 118 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a883c9876345..6690b25d6d29 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -57,46 +57,46 @@ static inline struct zram *dev_to_zram(struct device *dev) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram_meta *meta, u32 index, +static int zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - return meta->table[index].value & BIT(flag); + return zram->table[index].value & BIT(flag); } -static void zram_set_flag(struct zram_meta *meta, u32 index, +static void zram_set_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value |= BIT(flag); + zram->table[index].value |= BIT(flag); } -static void zram_clear_flag(struct zram_meta *meta, u32 index, +static void zram_clear_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value &= ~BIT(flag); + zram->table[index].value &= ~BIT(flag); } -static inline void zram_set_element(struct zram_meta *meta, u32 index, +static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { - meta->table[index].element = element; + zram->table[index].element = element; } -static inline void zram_clear_element(struct zram_meta *meta, u32 index) +static inline void 
zram_clear_element(struct zram *zram, u32 index) { - meta->table[index].element = 0; + zram->table[index].element = 0; } -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +static size_t zram_get_obj_size(struct zram *zram, u32 index) { - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static void zram_set_obj_size(struct zram_meta *meta, +static void zram_set_obj_size(struct zram *zram, u32 index, size_t size) { - unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } #if PAGE_SIZE != 4096 @@ -249,9 +249,8 @@ static ssize_t mem_used_max_store(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - struct zram_meta *meta = zram->meta; atomic_long_set(&zram->stats.max_used_pages, - zs_get_total_pages(meta->mem_pool)); + zs_get_total_pages(zram->mem_pool)); } up_read(&zram->init_lock); @@ -324,7 +323,6 @@ static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; down_read(&zram->init_lock); if (!init_done(zram)) { @@ -332,8 +330,7 @@ static ssize_t compact_store(struct device *dev, return -EINVAL; } - meta = zram->meta; - zs_compact(meta->mem_pool); + zs_compact(zram->mem_pool); up_read(&zram->init_lock); return len; @@ -370,8 +367,8 @@ static ssize_t mm_stat_show(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - mem_used = zs_get_total_pages(zram->meta->mem_pool); - zs_pool_stats(zram->meta->mem_pool, &pool_stats); + mem_used = zs_get_total_pages(zram->mem_pool); + zs_pool_stats(zram->mem_pool, &pool_stats); } orig_size = atomic64_read(&zram->stats.pages_stored); @@ -414,32 +411,26 @@ static 
DEVICE_ATTR_RO(debug_stat); static void zram_slot_lock(struct zram *zram, u32 index) { - struct zram_meta *meta = zram->meta; - - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); } static void zram_slot_unlock(struct zram *zram, u32 index) { - struct zram_meta *meta = zram->meta; - - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); } static bool zram_same_page_read(struct zram *zram, u32 index, struct page *page, unsigned int offset, unsigned int len) { - struct zram_meta *meta = zram->meta; - zram_slot_lock(zram, index); - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_SAME)) { + if (unlikely(!zram->table[index].handle) || + zram_test_flag(zram, index, ZRAM_SAME)) { void *mem; zram_slot_unlock(zram, index); mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, meta->table[index].element); + zram_fill_page(mem + offset, len, zram->table[index].element); kunmap_atomic(mem); return true; } @@ -455,14 +446,12 @@ static bool zram_same_page_write(struct zram *zram, u32 index, void *mem = kmap_atomic(page); if (page_same_filled(mem, &element)) { - struct zram_meta *meta = zram->meta; - kunmap_atomic(mem); /* Free memory associated with this sector now. 
*/ zram_slot_lock(zram, index); zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_SAME); - zram_set_element(meta, index, element); + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_element(zram, index, element); zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); @@ -473,56 +462,44 @@ static bool zram_same_page_write(struct zram *zram, u32 index, return false; } -static void zram_meta_free(struct zram_meta *meta, u64 disksize) +static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; size_t index; /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) { - unsigned long handle = meta->table[index].handle; + unsigned long handle = zram->table[index].handle; /* * No memory is allocated for same element filled pages. * Simply clear same page flag. */ - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) + if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) continue; - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); } - zs_destroy_pool(meta->mem_pool); - vfree(meta->table); - kfree(meta); + zs_destroy_pool(zram->mem_pool); + vfree(zram->table); } -static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) +static bool zram_meta_alloc(struct zram *zram, u64 disksize) { size_t num_pages; - struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); - - if (!meta) - return NULL; num_pages = disksize >> PAGE_SHIFT; - meta->table = vzalloc(num_pages * sizeof(*meta->table)); - if (!meta->table) { - pr_err("Error allocating zram address table\n"); - goto out_error; + zram->table = vzalloc(num_pages * sizeof(*zram->table)); + if (!zram->table) + return false; + + zram->mem_pool = zs_create_pool(zram->disk->disk_name); + if (!zram->mem_pool) { + vfree(zram->table); + return false; } - meta->mem_pool = zs_create_pool(pool_name); - if (!meta->mem_pool) { - pr_err("Error creating memory pool\n"); - goto out_error; - } 
- - return meta; - -out_error: - vfree(meta->table); - kfree(meta); - return NULL; + return true; } /* @@ -532,16 +509,15 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) */ static void zram_free_page(struct zram *zram, size_t index) { - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; + unsigned long handle = zram->table[index].handle; /* * No memory is allocated for same element filled pages. * Simply clear same page flag. */ - if (zram_test_flag(meta, index, ZRAM_SAME)) { - zram_clear_flag(meta, index, ZRAM_SAME); - zram_clear_element(meta, index); + if (zram_test_flag(zram, index, ZRAM_SAME)) { + zram_clear_flag(zram, index, ZRAM_SAME); + zram_clear_element(zram, index); atomic64_dec(&zram->stats.same_pages); return; } @@ -549,14 +525,14 @@ static void zram_free_page(struct zram *zram, size_t index) if (!handle) return; - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); - atomic64_sub(zram_get_obj_size(meta, index), + atomic64_sub(zram_get_obj_size(zram, index), &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); - meta->table[index].handle = 0; - zram_set_obj_size(meta, index, 0); + zram->table[index].handle = 0; + zram_set_obj_size(zram, index, 0); } static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) @@ -565,16 +541,15 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) unsigned long handle; unsigned int size; void *src, *dst; - struct zram_meta *meta = zram->meta; if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; zram_slot_lock(zram, index); - handle = meta->table[index].handle; - size = zram_get_obj_size(meta, index); + handle = zram->table[index].handle; + size = zram_get_obj_size(zram, index); - src = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { dst = kmap_atomic(page); memcpy(dst, src, 
PAGE_SIZE); @@ -588,7 +563,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) kunmap_atomic(dst); zcomp_stream_put(zram->comp); } - zs_unmap_object(meta->mem_pool, handle); + zs_unmap_object(zram->mem_pool, handle); zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ @@ -640,7 +615,6 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, void *src; unsigned long alloced_pages; unsigned long handle = 0; - struct zram_meta *meta = zram->meta; compress_again: src = kmap_atomic(page); @@ -650,7 +624,7 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); if (handle) - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); return ret; } @@ -671,7 +645,7 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, * from the slow path and handle has already been allocated. */ if (!handle) - handle = zs_malloc(meta->mem_pool, comp_len, + handle = zs_malloc(zram->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | @@ -679,7 +653,7 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, if (!handle) { zcomp_stream_put(zram->comp); atomic64_inc(&zram->stats.writestall); - handle = zs_malloc(meta->mem_pool, comp_len, + handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); *zstrm = zcomp_stream_get(zram->comp); @@ -688,11 +662,11 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, return -ENOMEM; } - alloced_pages = zs_get_total_pages(meta->mem_pool); + alloced_pages = zs_get_total_pages(zram->mem_pool); update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); return -ENOMEM; } @@ -708,7 +682,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 
index) unsigned int comp_len; void *src, *dst; struct zcomp_strm *zstrm; - struct zram_meta *meta = zram->meta; struct page *page = bvec->bv_page; if (zram_same_page_write(zram, index, page)) @@ -721,8 +694,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) return ret; } - - dst = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); src = zstrm->buffer; if (comp_len == PAGE_SIZE) @@ -732,7 +704,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) kunmap_atomic(src); zcomp_stream_put(zram->comp); - zs_unmap_object(meta->mem_pool, handle); + zs_unmap_object(zram->mem_pool, handle); /* * Free memory associated with this sector @@ -740,8 +712,8 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) */ zram_slot_lock(zram, index); zram_free_page(zram, index); - meta->table[index].handle = handle; - zram_set_obj_size(meta, index, comp_len); + zram->table[index].handle = handle; + zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); /* Update stats */ @@ -926,10 +898,8 @@ static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; - struct zram_meta *meta; zram = bdev->bd_disk->private_data; - meta = zram->meta; zram_slot_lock(zram, index); zram_free_page(zram, index); @@ -977,7 +947,6 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, static void zram_reset_device(struct zram *zram) { - struct zram_meta *meta; struct zcomp *comp; u64 disksize; @@ -990,7 +959,6 @@ static void zram_reset_device(struct zram *zram) return; } - meta = zram->meta; comp = zram->comp; disksize = zram->disksize; @@ -1003,7 +971,7 @@ static void zram_reset_device(struct zram *zram) up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); zcomp_destroy(comp); } @@ -1012,7 
+980,6 @@ static ssize_t disksize_store(struct device *dev, { u64 disksize; struct zcomp *comp; - struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); int err; @@ -1020,10 +987,18 @@ static ssize_t disksize_store(struct device *dev, if (!disksize) return -EINVAL; + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_unlock; + } + disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->disk_name, disksize); - if (!meta) - return -ENOMEM; + if (!zram_meta_alloc(zram, disksize)) { + err = -ENOMEM; + goto out_unlock; + } comp = zcomp_create(zram->compressor); if (IS_ERR(comp)) { @@ -1033,14 +1008,6 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - down_write(&zram->init_lock); - if (init_done(zram)) { - pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_destroy_comp; - } - - zram->meta = meta; zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); @@ -1049,11 +1016,10 @@ static ssize_t disksize_store(struct device *dev, return len; -out_destroy_comp: - up_write(&zram->init_lock); - zcomp_destroy(comp); out_free_meta: - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); +out_unlock: + up_write(&zram->init_lock); return err; } @@ -1240,7 +1206,6 @@ static int zram_add(void) goto out_free_disk; } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); - zram->meta = NULL; pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index caeff51f1571..e34e44d02e3e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -92,13 +92,9 @@ struct zram_stats { atomic64_t writestall; /* no. 
of write slow paths */ }; -struct zram_meta { +struct zram { struct zram_table_entry *table; struct zs_pool *mem_pool; -}; - -struct zram { - struct zram_meta *meta; struct zcomp *comp; struct gendisk *disk; /* Prevent concurrent execution of device init */ From 84f48a884b42de75eee774b81c259a4ec1f6d5a9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:55:50 -0700 Subject: [PATCH 1163/1212] UPSTREAM: zram: introduce zram data accessor With element, sometime I got confused handle and element access. It might be my bad but I think it's time to introduce accessor to prevent future idiot like me. This patch is just clean-up patch so it shouldn't change any behavior. Link: http://lkml.kernel.org/r/1492052365-16169-6-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 643ae61d0f41c48aa7179921fe15ba4b4d8ddfec) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I3916d5561ab9fb2917455cac74bee431fbe84b5d Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6690b25d6d29..cde2129304d6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -56,6 +56,16 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } +static unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + /* flag operations require table entry bit_spin_lock() being held */ static int zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) @@ -81,9 +91,9 @@ static inline void 
zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } -static inline void zram_clear_element(struct zram *zram, u32 index) +static unsigned long zram_get_element(struct zram *zram, u32 index) { - zram->table[index].element = 0; + return zram->table[index].element; } static size_t zram_get_obj_size(struct zram *zram, u32 index) @@ -424,13 +434,14 @@ static bool zram_same_page_read(struct zram *zram, u32 index, unsigned int offset, unsigned int len) { zram_slot_lock(zram, index); - if (unlikely(!zram->table[index].handle) || - zram_test_flag(zram, index, ZRAM_SAME)) { + if (unlikely(!zram_get_handle(zram, index) || + zram_test_flag(zram, index, ZRAM_SAME))) { void *mem; zram_slot_unlock(zram, index); mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, zram->table[index].element); + zram_fill_page(mem + offset, len, + zram_get_element(zram, index)); kunmap_atomic(mem); return true; } @@ -469,7 +480,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) { - unsigned long handle = zram->table[index].handle; + unsigned long handle = zram_get_handle(zram, index); /* * No memory is allocated for same element filled pages. * Simply clear same page flag. @@ -509,7 +520,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) */ static void zram_free_page(struct zram *zram, size_t index) { - unsigned long handle = zram->table[index].handle; + unsigned long handle = zram_get_handle(zram, index); /* * No memory is allocated for same element filled pages. 
@@ -517,7 +528,7 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(zram, index, ZRAM_SAME)) { zram_clear_flag(zram, index, ZRAM_SAME); - zram_clear_element(zram, index); + zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); return; } @@ -531,7 +542,7 @@ static void zram_free_page(struct zram *zram, size_t index) &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); - zram->table[index].handle = 0; + zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); } @@ -546,7 +557,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) return 0; zram_slot_lock(zram, index); - handle = zram->table[index].handle; + handle = zram_get_handle(zram, index); size = zram_get_obj_size(zram, index); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); @@ -712,7 +723,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) */ zram_slot_lock(zram, index); zram_free_page(zram, index); - zram->table[index].handle = handle; + zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); From 12306d58fe6d3254b4233e5c445514a363cdbcf9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:55:53 -0700 Subject: [PATCH 1164/1212] UPSTREAM: zram: use zram_free_page instead of open-coded The zram_free_page already handles NULL handle case and same page so use it to reduce error probability. 
(Actually, I made a mistake when I handled same page feature) Link: http://lkml.kernel.org/r/1492052365-16169-7-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 302128dce142d780417aa548bfd7ef4dfb89fa80) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ie38c52dfb1959377936b7cd9158ad1b5a02219bd Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index cde2129304d6..2dbd39781254 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -479,17 +479,8 @@ static void zram_meta_free(struct zram *zram, u64 disksize) size_t index; /* Free all pages that are still in this zram device */ - for (index = 0; index < num_pages; index++) { - unsigned long handle = zram_get_handle(zram, index); - /* - * No memory is allocated for same element filled pages. - * Simply clear same page flag.
- */ - if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) - continue; - - zs_free(zram->mem_pool, handle); - } + for (index = 0; index < num_pages; index++) + zram_free_page(zram, index); zs_destroy_pool(zram->mem_pool); vfree(zram->table); @@ -972,9 +963,6 @@ static void zram_reset_device(struct zram *zram) comp = zram->comp; disksize = zram->disksize; - - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; set_capacity(zram->disk, 0); @@ -983,6 +971,7 @@ static void zram_reset_device(struct zram *zram) up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ zram_meta_free(zram, disksize); + memset(&zram->stats, 0, sizeof(zram->stats)); zcomp_destroy(comp); } From 88befbacdb1218dda6a9b7923d50da99998431d2 Mon Sep 17 00:00:00 2001 From: Sangwoo Park Date: Wed, 3 May 2017 14:55:56 -0700 Subject: [PATCH 1165/1212] UPSTREAM: zram: reduce load operation in page_same_filled In the page_same_filled function, each element in the page is compared with the value at the next index. The current comparison routine compares the (i)th and (i+1)th values of the page. In this case, two load operations occur for each comparison. But if we store the first value of the page in a 'val' variable and use it to compare with the others, the number of load operations is reduced. It reduces load operations per page by up to 64 times.
Link: http://lkml.kernel.org/r/1488428104-7257-1-git-send-email-sangwoo2.park@lge.com Signed-off-by: Sangwoo Park Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f0fe9984656604ea8effd5ff82709ff8ce1f954b) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I6b58b583e83139eee9f0540da12850c43510cb8e Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 2dbd39781254..2db2459857fb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -194,15 +194,17 @@ static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; + unsigned long val; page = (unsigned long *)ptr; + val = page[0]; - for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { - if (page[pos] != page[pos + 1]) + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (val != page[pos]) return false; } - *element = page[pos]; + *element = val; return true; } From 2b90f2dd08393e43b117f149dc6a1ed34069db8f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:12 -0700 Subject: [PATCH 1166/1212] UPSTREAM: zram: count same page write as page_stored Regardless of whether it is same page or not, it's surely write and stored to zram so we should increase pages_stored stat. Otherwise, user can see zero value via mm_stats although he writes a lot of pages to zram. 
Link: http://lkml.kernel.org/r/1494834068-27004-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 51f9f82c855d65ef14c2af10e0d2c86ec332a182) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I006d80df413a0fe0fd7dd58e535c6a2c03ab2c9d Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 2db2459857fb..ac9b1cacb970 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -468,6 +468,7 @@ static bool zram_same_page_write(struct zram *zram, u32 index, zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); + atomic64_inc(&zram->stats.pages_stored); return true; } kunmap_atomic(mem); @@ -523,6 +524,7 @@ static void zram_free_page(struct zram *zram, size_t index) zram_clear_flag(zram, index, ZRAM_SAME); zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); + atomic64_dec(&zram->stats.pages_stored); return; } From ca03d6eae2bc199d7b51d3a39869a0687b726f6f Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 10 Jul 2017 15:50:15 -0700 Subject: [PATCH 1167/1212] UPSTREAM: zram: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by <linux/sysfs.h> work with const attribute_group. So mark the non-const structs as const.
File size before: text data bss dec hex filename 8293 841 4 9138 23b2 drivers/block/zram/zram_drv.o File size After adding 'const': text data bss dec hex filename 8357 777 4 9138 23b2 drivers/block/zram/zram_drv.o Link: http://lkml.kernel.org/r/65680c1c4d85818f7094cbfa31c91bf28185ba1b.1499061182.git.arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit bc1bb362334ebc4c65dd4301f10fb70902b3db7d) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ic0765dea8c2fadb18623605ba48748a9b33df3fa Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ac9b1cacb970..5e3ee189f9cf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1118,7 +1118,7 @@ static struct attribute *zram_disk_attrs[] = { NULL, }; -static struct attribute_group zram_disk_attr_group = { +static const struct attribute_group zram_disk_attr_group = { .attrs = zram_disk_attrs, }; From 0ef897677ba14a1c1bfb9cc8c8a81041d4186ce5 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 10 Aug 2017 15:24:29 -0700 Subject: [PATCH 1168/1212] UPSTREAM: zram: rework copy of compressor name in comp_algorithm_store() comp_algorithm_store() passes the size of the source buffer to strlcpy() instead of the destination buffer size. Make it explicit that the two buffers have the same size and use strcpy() instead of strlcpy(). The latter can be done safely since the function ensures that the string in the source buffer is terminated. 
Link: http://lkml.kernel.org/r/20170803163350.45245-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Reviewed-by: Douglas Anderson Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f357e345eef7863da037e0243f2d3df4ba6df986) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ic9667b215ce5e0717bc6829d65e43e9b79602362 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5e3ee189f9cf..7bed8e73c376 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -307,7 +307,7 @@ static ssize_t comp_algorithm_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - char compressor[CRYPTO_MAX_ALG_NAME]; + char compressor[ARRAY_SIZE(zram->compressor)]; size_t sz; strlcpy(compressor, buf, sizeof(compressor)); @@ -326,7 +326,7 @@ static ssize_t comp_algorithm_store(struct device *dev, return -EBUSY; } - strlcpy(zram->compressor, compressor, sizeof(compressor)); + strcpy(zram->compressor, compressor); up_write(&zram->init_lock); return len; } From 34aef16229fb77ab7b91b17bbe8003798dccae23 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 4 Aug 2017 13:19:17 -0700 Subject: [PATCH 1169/1212] UPSTREAM: lib: Add xxhash module Adds xxhash kernel module with xxh32 and xxh64 hashes. xxhash is an extremely fast non-cryptographic hash algorithm for checksumming. The zstd compression and decompression modules added in the next patch require xxhash. I extracted it out from zstd since it is useful on its own. I copied the code from the upstream XXHash source repository and translated it into kernel style. I ran benchmarks and tests in the kernel and tests in userland. I benchmarked xxhash as a special character device. 
I ran in four modes, no-op, xxh32, xxh64, and crc32. The no-op mode simply copies the data to kernel space and ignores it. The xxh32, xxh64, and crc32 modes compute hashes on the copied data. I also ran it with four different buffer sizes. The benchmark file is located in the upstream zstd source repository under `contrib/linux-kernel/xxhash_test.c` [1]. I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, 16 GB of RAM, and an SSD. I benchmarked using the file `filesystem.squashfs` from `ubuntu-16.10-desktop-amd64.iso`, which is 1,536,217,088 B large. Run the following commands for the benchmark: modprobe xxhash_test mknod xxhash_test c 245 0 time cp filesystem.squashfs xxhash_test The time is reported by the time of the userland `cp`. The GB/s is computed with 1,536,217,088 B / time(buffer size, hash) which includes the time to copy from userland. The Adjusted GB/s is computed with 1,536,217,088 B / (time(buffer size, hash) - time(buffer size, none)). | Buffer Size (B) | Hash | Time (s) | GB/s | Adjusted GB/s | |-----------------|-------|----------|------|---------------| | 1024 | none | 0.408 | 3.77 | - | | 1024 | xxh32 | 0.649 | 2.37 | 6.37 | | 1024 | xxh64 | 0.542 | 2.83 | 11.46 | | 1024 | crc32 | 1.290 | 1.19 | 1.74 | | 4096 | none | 0.380 | 4.04 | - | | 4096 | xxh32 | 0.645 | 2.38 | 5.79 | | 4096 | xxh64 | 0.500 | 3.07 | 12.80 | | 4096 | crc32 | 1.168 | 1.32 | 1.95 | | 8192 | none | 0.351 | 4.38 | - | | 8192 | xxh32 | 0.614 | 2.50 | 5.84 | | 8192 | xxh64 | 0.464 | 3.31 | 13.60 | | 8192 | crc32 | 1.163 | 1.32 | 1.89 | | 16384 | none | 0.346 | 4.43 | - | | 16384 | xxh32 | 0.590 | 2.60 | 6.30 | | 16384 | xxh64 | 0.466 | 3.30 | 12.80 | | 16384 | crc32 | 1.183 | 1.30 | 1.84 | Tested in userland using the test-suite in the zstd repo under `contrib/linux-kernel/test/XXHashUserlandTest.cpp` [2] by mocking the kernel functions.
A line in each branch of every function in `xxhash.c` was commented out to ensure that the test-suite fails. Additionally tested while testing zstd and with SMHasher [3]. [1] https://phabricator.intern.facebook.com/P57526246 [2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/XXHashUserlandTest.cpp [3] https://github.com/aappleby/smhasher zstd source repository: https://github.com/facebook/zstd XXHash source repository: https://github.com/cyan4973/xxhash Signed-off-by: Nick Terrell Signed-off-by: Chris Mason (cherry picked from commit 5d2405227a9eaea48e8cc95756a06d407b11f141) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I4b63e96457f17cf455591e8f35058dacd7aa9004 Signed-off-by: Amit Pundir --- include/linux/xxhash.h | 236 +++++++++++++++++++ lib/Kconfig | 3 + lib/Makefile | 1 + lib/xxhash.c | 500 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 740 insertions(+) create mode 100644 include/linux/xxhash.h create mode 100644 lib/xxhash.c diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h new file mode 100644 index 000000000000..9e1f42cb57e9 --- /dev/null +++ b/include/linux/xxhash.h @@ -0,0 +1,236 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + * + * You can contact the author at: + * - xxHash homepage: http://cyan4973.github.io/xxHash/ + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Notice extracted from xxHash homepage: + * + * xxHash is an extremely fast Hash algorithm, running at RAM speed limits. + * It also successfully passes all tests from the SMHasher suite. 
+ * + * Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 + * Duo @3GHz) + * + * Name Speed Q.Score Author + * xxHash 5.4 GB/s 10 + * CrapWow 3.2 GB/s 2 Andrew + * MumurHash 3a 2.7 GB/s 10 Austin Appleby + * SpookyHash 2.0 GB/s 10 Bob Jenkins + * SBox 1.4 GB/s 9 Bret Mulvey + * Lookup3 1.2 GB/s 9 Bob Jenkins + * SuperFastHash 1.2 GB/s 1 Paul Hsieh + * CityHash64 1.05 GB/s 10 Pike & Alakuijala + * FNV 0.55 GB/s 5 Fowler, Noll, Vo + * CRC32 0.43 GB/s 9 + * MD5-32 0.33 GB/s 10 Ronald L. Rivest + * SHA1-32 0.28 GB/s 10 + * + * Q.Score is a measure of quality of the hash function. + * It depends on successfully passing SMHasher test set. + * 10 is a perfect score. + * + * A 64-bits version, named xxh64 offers much better speed, + * but for 64-bits applications only. + * Name Speed on 64 bits Speed on 32 bits + * xxh64 13.8 GB/s 1.9 GB/s + * xxh32 6.8 GB/s 6.0 GB/s + */ + +#ifndef XXHASH_H +#define XXHASH_H + +#include + +/*-**************************** + * Simple Hash Functions + *****************************/ + +/** + * xxh32() - calculate the 32-bit hash of the input with a given seed. + * + * @input: The data to hash. + * @length: The length of the data to hash. + * @seed: The seed can be used to alter the result predictably. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s + * + * Return: The 32-bit hash of the data. + */ +uint32_t xxh32(const void *input, size_t length, uint32_t seed); + +/** + * xxh64() - calculate the 64-bit hash of the input with a given seed. + * + * @input: The data to hash. + * @length: The length of the data to hash. + * @seed: The seed can be used to alter the result predictably. + * + * This function runs 2x faster on 64-bit systems, but slower on 32-bit systems. + * + * Return: The 64-bit hash of the data. 
+ */ +uint64_t xxh64(const void *input, size_t length, uint64_t seed); + +/*-**************************** + * Streaming Hash Functions + *****************************/ + +/* + * These definitions are only meant to allow allocation of XXH state + * statically, on stack, or in a struct for example. + * Do not use members directly. + */ + +/** + * struct xxh32_state - private xxh32 state, do not use members directly + */ +struct xxh32_state { + uint32_t total_len_32; + uint32_t large_len; + uint32_t v1; + uint32_t v2; + uint32_t v3; + uint32_t v4; + uint32_t mem32[4]; + uint32_t memsize; +}; + +/** + * struct xxh32_state - private xxh64 state, do not use members directly + */ +struct xxh64_state { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; +}; + +/** + * xxh32_reset() - reset the xxh32 state to start a new hashing operation + * + * @state: The xxh32 state to reset. + * @seed: Initialize the hash state with this seed. + * + * Call this function on any xxh32_state to prepare for a new hashing operation. + */ +void xxh32_reset(struct xxh32_state *state, uint32_t seed); + +/** + * xxh32_update() - hash the data given and update the xxh32 state + * + * @state: The xxh32 state to update. + * @input: The data to hash. + * @length: The length of the data to hash. + * + * After calling xxh32_reset() call xxh32_update() as many times as necessary. + * + * Return: Zero on success, otherwise an error code. + */ +int xxh32_update(struct xxh32_state *state, const void *input, size_t length); + +/** + * xxh32_digest() - produce the current xxh32 hash + * + * @state: Produce the current xxh32 hash of this state. + * + * A hash value can be produced at any time. It is still possible to continue + * inserting input into the hash state after a call to xxh32_digest(), and + * generate new hashes later on, by calling xxh32_digest() again. + * + * Return: The xxh32 hash stored in the state. 
+ */ +uint32_t xxh32_digest(const struct xxh32_state *state); + +/** + * xxh64_reset() - reset the xxh64 state to start a new hashing operation + * + * @state: The xxh64 state to reset. + * @seed: Initialize the hash state with this seed. + */ +void xxh64_reset(struct xxh64_state *state, uint64_t seed); + +/** + * xxh64_update() - hash the data given and update the xxh64 state + * @state: The xxh64 state to update. + * @input: The data to hash. + * @length: The length of the data to hash. + * + * After calling xxh64_reset() call xxh64_update() as many times as necessary. + * + * Return: Zero on success, otherwise an error code. + */ +int xxh64_update(struct xxh64_state *state, const void *input, size_t length); + +/** + * xxh64_digest() - produce the current xxh64 hash + * + * @state: Produce the current xxh64 hash of this state. + * + * A hash value can be produced at any time. It is still possible to continue + * inserting input into the hash state after a call to xxh64_digest(), and + * generate new hashes later on, by calling xxh64_digest() again. + * + * Return: The xxh64 hash stored in the state. + */ +uint64_t xxh64_digest(const struct xxh64_state *state); + +/*-************************** + * Utils + ***************************/ + +/** + * xxh32_copy_state() - copy the source state into the destination state + * + * @src: The source xxh32 state. + * @dst: The destination xxh32 state. + */ +void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src); + +/** + * xxh64_copy_state() - copy the source state into the destination state + * + * @src: The source xxh64 state. + * @dst: The destination xxh64 state. + */ +void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src); + +#endif /* XXHASH_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 48635688046f..45f6b279d4a0 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -185,6 +185,9 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. 
Module will be called crc8. +config XXHASH + tristate + config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH diff --git a/lib/Makefile b/lib/Makefile index b9ad86add71d..eb5af4a37c7f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_XXHASH) += xxhash.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_842_COMPRESS) += 842/ diff --git a/lib/xxhash.c b/lib/xxhash.c new file mode 100644 index 000000000000..aa61e2a3802f --- /dev/null +++ b/lib/xxhash.c @@ -0,0 +1,500 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + * + * You can contact the author at: + * - xxHash homepage: http://cyan4973.github.io/xxHash/ + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +#include <asm/unaligned.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/xxhash.h> + +/*-************************************* + * Macros + **************************************/ +#define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r))) +#define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r))) + +#ifdef __LITTLE_ENDIAN +# define XXH_CPU_LITTLE_ENDIAN 1 +#else +# define XXH_CPU_LITTLE_ENDIAN 0 +#endif + +/*-************************************* + * Constants + **************************************/ +static const uint32_t PRIME32_1 = 2654435761U; +static const uint32_t PRIME32_2 = 2246822519U; +static const uint32_t PRIME32_3 = 3266489917U; +static const uint32_t PRIME32_4 = 668265263U; +static const uint32_t PRIME32_5 = 374761393U; + +static const uint64_t PRIME64_1 = 11400714785074694791ULL; +static const uint64_t PRIME64_2 = 14029467366897019727ULL; +static const uint64_t PRIME64_3 = 1609587929392839161ULL; +static const uint64_t PRIME64_4 = 
9650029242287828579ULL; +static const uint64_t PRIME64_5 = 2870177450012600261ULL; + +/*-************************** + * Utils + ***************************/ +void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) +{ + memcpy(dst, src, sizeof(*dst)); +} +EXPORT_SYMBOL(xxh32_copy_state); + +void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) +{ + memcpy(dst, src, sizeof(*dst)); +} +EXPORT_SYMBOL(xxh64_copy_state); + +/*-*************************** + * Simple Hash Functions + ****************************/ +static uint32_t xxh32_round(uint32_t seed, const uint32_t input) +{ + seed += input * PRIME32_2; + seed = xxh_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +uint32_t xxh32(const void *input, const size_t len, const uint32_t seed) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *b_end = p + len; + uint32_t h32; + + if (len >= 16) { + const uint8_t *const limit = b_end - 16; + uint32_t v1 = seed + PRIME32_1 + PRIME32_2; + uint32_t v2 = seed + PRIME32_2; + uint32_t v3 = seed + 0; + uint32_t v4 = seed - PRIME32_1; + + do { + v1 = xxh32_round(v1, get_unaligned_le32(p)); + p += 4; + v2 = xxh32_round(v2, get_unaligned_le32(p)); + p += 4; + v3 = xxh32_round(v3, get_unaligned_le32(p)); + p += 4; + v4 = xxh32_round(v4, get_unaligned_le32(p)); + p += 4; + } while (p <= limit); + + h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) + + xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (uint32_t)len; + + while (p + 4 <= b_end) { + h32 += get_unaligned_le32(p) * PRIME32_3; + h32 = xxh_rotl32(h32, 17) * PRIME32_4; + p += 4; + } + + while (p < b_end) { + h32 += (*p) * PRIME32_5; + h32 = xxh_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} +EXPORT_SYMBOL(xxh32); + +static uint64_t xxh64_round(uint64_t acc, const uint64_t input) +{ + acc += input * PRIME64_2; + 
acc = xxh_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val) +{ + val = xxh64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +uint64_t xxh64(const void *input, const size_t len, const uint64_t seed) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + uint64_t h64; + + if (len >= 32) { + const uint8_t *const limit = b_end - 32; + uint64_t v1 = seed + PRIME64_1 + PRIME64_2; + uint64_t v2 = seed + PRIME64_2; + uint64_t v3 = seed + 0; + uint64_t v4 = seed - PRIME64_1; + + do { + v1 = xxh64_round(v1, get_unaligned_le64(p)); + p += 8; + v2 = xxh64_round(v2, get_unaligned_le64(p)); + p += 8; + v3 = xxh64_round(v3, get_unaligned_le64(p)); + p += 8; + v4 = xxh64_round(v4, get_unaligned_le64(p)); + p += 8; + } while (p <= limit); + + h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); + h64 = xxh64_merge_round(h64, v1); + h64 = xxh64_merge_round(h64, v2); + h64 = xxh64_merge_round(h64, v3); + h64 = xxh64_merge_round(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (uint64_t)len; + + while (p + 8 <= b_end) { + const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); + + h64 ^= k1; + h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; + p += 8; + } + + if (p + 4 <= b_end) { + h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; + h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p += 4; + } + + while (p < b_end) { + h64 ^= (*p) * PRIME64_5; + h64 = xxh_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} +EXPORT_SYMBOL(xxh64); + +/*-************************************************** + * Advanced Hash Functions + ***************************************************/ +void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) +{ + /* use a local state for memcpy() to 
avoid strict-aliasing warnings */ + struct xxh32_state state; + + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + memcpy(statePtr, &state, sizeof(state)); +} +EXPORT_SYMBOL(xxh32_reset); + +void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) +{ + /* use a local state for memcpy() to avoid strict-aliasing warnings */ + struct xxh64_state state; + + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + memcpy(statePtr, &state, sizeof(state)); +} +EXPORT_SYMBOL(xxh64_reset); + +int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + + if (input == NULL) + return -EINVAL; + + state->total_len_32 += (uint32_t)len; + state->large_len |= (len >= 16) | (state->total_len_32 >= 16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); + state->memsize += (uint32_t)len; + return 0; + } + + if (state->memsize) { /* some data left from previous update */ + const uint32_t *p32 = state->mem32; + + memcpy((uint8_t *)(state->mem32) + state->memsize, input, + 16 - state->memsize); + + state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); + p32++; + state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); + p32++; + state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); + p32++; + state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); + p32++; + + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= b_end - 16) { + const uint8_t *const limit = b_end - 16; + uint32_t v1 = state->v1; + uint32_t v2 = state->v2; + uint32_t v3 = state->v3; + uint32_t v4 = state->v4; + + do { + v1 = xxh32_round(v1, get_unaligned_le32(p)); + p += 4; + 
v2 = xxh32_round(v2, get_unaligned_le32(p)); + p += 4; + v3 = xxh32_round(v3, get_unaligned_le32(p)); + p += 4; + v4 = xxh32_round(v4, get_unaligned_le32(p)); + p += 4; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < b_end) { + memcpy(state->mem32, p, (size_t)(b_end-p)); + state->memsize = (uint32_t)(b_end-p); + } + + return 0; +} +EXPORT_SYMBOL(xxh32_update); + +uint32_t xxh32_digest(const struct xxh32_state *state) +{ + const uint8_t *p = (const uint8_t *)state->mem32; + const uint8_t *const b_end = (const uint8_t *)(state->mem32) + + state->memsize; + uint32_t h32; + + if (state->large_len) { + h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + + xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + while (p + 4 <= b_end) { + h32 += get_unaligned_le32(p) * PRIME32_3; + h32 = xxh_rotl32(h32, 17) * PRIME32_4; + p += 4; + } + + while (p < b_end) { + h32 += (*p) * PRIME32_5; + h32 = xxh_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} +EXPORT_SYMBOL(xxh32_digest); + +int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + + if (input == NULL) + return -EINVAL; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + memcpy(((uint8_t *)state->mem64) + state->memsize, input, len); + state->memsize += (uint32_t)len; + return 0; + } + + if (state->memsize) { /* tmp buffer is full */ + uint64_t *p64 = state->mem64; + + memcpy(((uint8_t *)p64) + state->memsize, input, + 32 - state->memsize); + + state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64)); + p64++; + state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64)); + p64++; + 
state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64)); + p64++; + state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64)); + + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= b_end) { + const uint8_t *const limit = b_end - 32; + uint64_t v1 = state->v1; + uint64_t v2 = state->v2; + uint64_t v3 = state->v3; + uint64_t v4 = state->v4; + + do { + v1 = xxh64_round(v1, get_unaligned_le64(p)); + p += 8; + v2 = xxh64_round(v2, get_unaligned_le64(p)); + p += 8; + v3 = xxh64_round(v3, get_unaligned_le64(p)); + p += 8; + v4 = xxh64_round(v4, get_unaligned_le64(p)); + p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < b_end) { + memcpy(state->mem64, p, (size_t)(b_end-p)); + state->memsize = (uint32_t)(b_end - p); + } + + return 0; +} +EXPORT_SYMBOL(xxh64_update); + +uint64_t xxh64_digest(const struct xxh64_state *state) +{ + const uint8_t *p = (const uint8_t *)state->mem64; + const uint8_t *const b_end = (const uint8_t *)state->mem64 + + state->memsize; + uint64_t h64; + + if (state->total_len >= 32) { + const uint64_t v1 = state->v1; + const uint64_t v2 = state->v2; + const uint64_t v3 = state->v3; + const uint64_t v4 = state->v4; + + h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); + h64 = xxh64_merge_round(h64, v1); + h64 = xxh64_merge_round(h64, v2); + h64 = xxh64_merge_round(h64, v3); + h64 = xxh64_merge_round(h64, v4); + } else { + h64 = state->v3 + PRIME64_5; + } + + h64 += (uint64_t)state->total_len; + + while (p + 8 <= b_end) { + const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); + + h64 ^= k1; + h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; + p += 8; + } + + if (p + 4 <= b_end) { + h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; + h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p += 4; + } + + while (p < b_end) { + h64 ^= (*p) * PRIME64_5; + h64 = xxh_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= 
h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} +EXPORT_SYMBOL(xxh64_digest); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("xxHash"); From e2aeff657807083632442c1749a9064bb7c1c9b4 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Wed, 9 Aug 2017 19:35:53 -0700 Subject: [PATCH 1170/1212] UPSTREAM: lib: Add zstd modules Add zstd compression and decompression kernel modules. zstd offers a wide variety of compression speed and quality trade-offs. It can compress at speeds approaching lz4, and quality approaching lzma. zstd decompresses at speeds more than twice as fast as zlib, and decompression speed remains roughly the same across all compression levels. The code was ported from the upstream zstd source repository. The `linux/zstd.h` header was modified to match linux kernel style. The cross-platform and allocation code was stripped out. Instead zstd requires the caller to pass a preallocated workspace. The source files were clang-formatted [1] to match the Linux Kernel style as much as possible. Otherwise, the code was unmodified. We would like to avoid as much further manual modification to the source code as possible, so it will be easier to keep the kernel zstd up to date. I benchmarked zstd compression as a special character device. I ran zstd and zlib compression at several levels, as well as performing no compression, which measures the time spent copying the data to kernel space. Data is passed to the compressor 4096 B at a time. The benchmark file is located in the upstream zstd source repository under `contrib/linux-kernel/zstd_compress_test.c` [2]. I ran the benchmarks on an Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, 16 GB of RAM, and an SSD. I benchmarked using `silesia.tar` [3], which is 211,988,480 B large. 
Run the following commands for the benchmark: sudo modprobe zstd_compress_test sudo mknod zstd_compress_test c 245 0 sudo cp silesia.tar zstd_compress_test The time is reported by the time of the userland `cp`. The MB/s is computed with 211,988,480 B / time(buffer size, hash) which includes the time to copy from userland. The Adjusted MB/s is computed with 211,988,480 B / (time(buffer size, hash) - time(buffer size, none)). The memory reported is the amount of memory the compressor requests. | Method | Size (B) | Time (s) | Ratio | MB/s | Adj MB/s | Mem (MB) | |----------|----------|----------|-------|---------|----------|----------| | none | 211988480 | 0.100 | 1 | 2119.88 | - | - | | zstd -1 | 73645762 | 1.044 | 2.878 | 203.05 | 224.56 | 1.23 | | zstd -3 | 66988878 | 1.761 | 3.165 | 120.38 | 127.63 | 2.47 | | zstd -5 | 65001259 | 2.563 | 3.261 | 82.71 | 86.07 | 2.86 | | zstd -10 | 60165346 | 13.242 | 3.523 | 16.01 | 16.13 | 13.22 | | zstd -15 | 58009756 | 47.601 | 3.654 | 4.45 | 4.46 | 21.61 | | zstd -19 | 54014593 | 102.835 | 3.925 | 2.06 | 2.06 | 60.15 | | zlib -1 | 77260026 | 2.895 | 2.744 | 73.23 | 75.85 | 0.27 | | zlib -3 | 72972206 | 4.116 | 2.905 | 51.50 | 52.79 | 0.27 | | zlib -6 | 68190360 | 9.633 | 3.109 | 22.01 | 22.24 | 0.27 | | zlib -9 | 67613382 | 22.554 | 3.135 | 9.40 | 9.44 | 0.27 | I benchmarked zstd decompression using the same method on the same machine. The benchmark file is located in the upstream zstd repo under `contrib/linux-kernel/zstd_decompress_test.c` [4]. The memory reported is the amount of memory required to decompress data compressed with the given compression level. If you know the maximum size of your input, you can reduce the memory usage of decompression irrespective of the compression level. 
| Method | Time (s) | MB/s | Adjusted MB/s | Memory (MB) | |----------|----------|---------|---------------|-------------| | none | 0.025 | 8479.54 | - | - | | zstd -1 | 0.358 | 592.15 | 636.60 | 0.84 | | zstd -3 | 0.396 | 535.32 | 571.40 | 1.46 | | zstd -5 | 0.396 | 535.32 | 571.40 | 1.46 | | zstd -10 | 0.374 | 566.81 | 607.42 | 2.51 | | zstd -15 | 0.379 | 559.34 | 598.84 | 4.61 | | zstd -19 | 0.412 | 514.54 | 547.77 | 8.80 | | zlib -1 | 0.940 | 225.52 | 231.68 | 0.04 | | zlib -3 | 0.883 | 240.08 | 247.07 | 0.04 | | zlib -6 | 0.844 | 251.17 | 258.84 | 0.04 | | zlib -9 | 0.837 | 253.27 | 287.64 | 0.04 | Tested in userland using the test-suite in the zstd repo under `contrib/linux-kernel/test/UserlandTest.cpp` [5] by mocking the kernel functions. Fuzz tested using libfuzzer [6] with the fuzz harnesses under `contrib/linux-kernel/test/{RoundTripCrash.c,DecompressCrash.c}` [7] [8] with ASAN, UBSAN, and MSAN. Additionally, it was tested while testing the BtrFS and SquashFS patches coming next. 
[1] https://clang.llvm.org/docs/ClangFormat.html [2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/zstd_compress_test.c [3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia [4] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/zstd_decompress_test.c [5] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/UserlandTest.cpp [6] http://llvm.org/docs/LibFuzzer.html [7] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/RoundTripCrash.c [8] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/DecompressCrash.c zstd source repository: https://github.com/facebook/zstd Signed-off-by: Nick Terrell Signed-off-by: Chris Mason (cherry picked from commit 73f3d1b48f5069d46ba48aa28c2898dc93185560) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I47b9d43a8065b2b5a1362f8458065f0811cf70b9 Signed-off-by: Amit Pundir --- include/linux/zstd.h | 1157 ++++++++++++ lib/Kconfig | 8 + lib/Makefile | 2 + lib/zstd/Makefile | 18 + lib/zstd/bitstream.h | 374 ++++ lib/zstd/compress.c | 3484 +++++++++++++++++++++++++++++++++++++ lib/zstd/decompress.c | 2528 +++++++++++++++++++++++++++ lib/zstd/entropy_common.c | 243 +++ lib/zstd/error_private.h | 53 + lib/zstd/fse.h | 575 ++++++ lib/zstd/fse_compress.c | 795 +++++++++ lib/zstd/fse_decompress.c | 332 ++++ lib/zstd/huf.h | 212 +++ lib/zstd/huf_compress.c | 770 ++++++++ lib/zstd/huf_decompress.c | 960 ++++++++++ lib/zstd/mem.h | 151 ++ lib/zstd/zstd_common.c | 75 + lib/zstd/zstd_internal.h | 263 +++ lib/zstd/zstd_opt.h | 1014 +++++++++++ 19 files changed, 13014 insertions(+) create mode 100644 include/linux/zstd.h create mode 100644 lib/zstd/Makefile create mode 100644 lib/zstd/bitstream.h create mode 100644 lib/zstd/compress.c create mode 100644 lib/zstd/decompress.c create mode 100644 lib/zstd/entropy_common.c create mode 100644 lib/zstd/error_private.h create mode 100644 lib/zstd/fse.h create mode 100644 lib/zstd/fse_compress.c create mode 100644 
lib/zstd/fse_decompress.c create mode 100644 lib/zstd/huf.h create mode 100644 lib/zstd/huf_compress.c create mode 100644 lib/zstd/huf_decompress.c create mode 100644 lib/zstd/mem.h create mode 100644 lib/zstd/zstd_common.c create mode 100644 lib/zstd/zstd_internal.h create mode 100644 lib/zstd/zstd_opt.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h new file mode 100644 index 000000000000..249575e2485f --- /dev/null +++ b/include/linux/zstd.h @@ -0,0 +1,1157 @@ +/* + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +#ifndef ZSTD_H +#define ZSTD_H + +/* ====== Dependency ======*/ +#include <linux/types.h> /* size_t */ + + +/*-***************************************************************************** + * Introduction + * + * zstd, short for Zstandard, is a fast lossless compression algorithm, + * targeting real-time compression scenarios at zlib-level and better + * compression ratios. The zstd compression library provides in-memory + * compression and decompression functions. The library supports compression + * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled + * ultra, should be used with caution, as they require more memory. 
+ * Compression can be done in: + * - a single step, reusing a context (described as Explicit memory management) + * - unbounded multiple steps (described as Streaming compression) + * The compression ratio achievable on small data can be highly improved using + * compression with a dictionary in: + * - a single step (described as Simple dictionary API) + * - a single step, reusing a dictionary (described as Fast dictionary API) + ******************************************************************************/ + +/*====== Helper functions ======*/ + +/** + * enum ZSTD_ErrorCode - zstd error codes + * + * Functions that return size_t can be checked for errors using ZSTD_isError() + * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode(). + */ +typedef enum { + ZSTD_error_no_error, + ZSTD_error_GENERIC, + ZSTD_error_prefix_unknown, + ZSTD_error_version_unsupported, + ZSTD_error_parameter_unknown, + ZSTD_error_frameParameter_unsupported, + ZSTD_error_frameParameter_unsupportedBy32bits, + ZSTD_error_frameParameter_windowTooLarge, + ZSTD_error_compressionParameter_unsupported, + ZSTD_error_init_missing, + ZSTD_error_memory_allocation, + ZSTD_error_stage_wrong, + ZSTD_error_dstSize_tooSmall, + ZSTD_error_srcSize_wrong, + ZSTD_error_corruption_detected, + ZSTD_error_checksum_wrong, + ZSTD_error_tableLog_tooLarge, + ZSTD_error_maxSymbolValue_tooLarge, + ZSTD_error_maxSymbolValue_tooSmall, + ZSTD_error_dictionary_corrupted, + ZSTD_error_dictionary_wrong, + ZSTD_error_dictionaryCreation_failed, + ZSTD_error_maxCode +} ZSTD_ErrorCode; + +/** + * ZSTD_maxCLevel() - maximum compression level available + * + * Return: Maximum compression level available. + */ +int ZSTD_maxCLevel(void); +/** + * ZSTD_compressBound() - maximum compressed size in worst case scenario + * @srcSize: The size of the data to compress. + * + * Return: The maximum compressed size in the worst case scenario. 
+ */ +size_t ZSTD_compressBound(size_t srcSize); +/** + * ZSTD_isError() - tells if a size_t function result is an error code + * @code: The function result to check for error. + * + * Return: Non-zero iff the code is an error. + */ +static __attribute__((unused)) unsigned int ZSTD_isError(size_t code) +{ + return code > (size_t)-ZSTD_error_maxCode; +} +/** + * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode + * @functionResult: The result of a function for which ZSTD_isError() is true. + * + * Return: The ZSTD_ErrorCode corresponding to the functionResult or 0 + * if the functionResult isn't an error. + */ +static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode( + size_t functionResult) +{ + if (!ZSTD_isError(functionResult)) + return (ZSTD_ErrorCode)0; + return (ZSTD_ErrorCode)(0 - functionResult); +} + +/** + * enum ZSTD_strategy - zstd compression search strategy + * + * From faster to stronger. + */ +typedef enum { + ZSTD_fast, + ZSTD_dfast, + ZSTD_greedy, + ZSTD_lazy, + ZSTD_lazy2, + ZSTD_btlazy2, + ZSTD_btopt, + ZSTD_btopt2 +} ZSTD_strategy; + +/** + * struct ZSTD_compressionParameters - zstd compression parameters + * @windowLog: Log of the largest match distance. Larger means more + * compression, and more memory needed during decompression. + * @chainLog: Fully searched segment. Larger means more compression, slower, + * and more memory (useless for fast). + * @hashLog: Dispatch table. Larger means more compression, + * slower, and more memory. + * @searchLog: Number of searches. Larger means more compression and slower. + * @searchLength: Match length searched. Larger means faster decompression, + * sometimes less compression. + * @targetLength: Acceptable match size for optimal parser (only). Larger means + * more compression, and slower. + * @strategy: The zstd compression strategy. 
+ */ +typedef struct { + unsigned int windowLog; + unsigned int chainLog; + unsigned int hashLog; + unsigned int searchLog; + unsigned int searchLength; + unsigned int targetLength; + ZSTD_strategy strategy; +} ZSTD_compressionParameters; + +/** + * struct ZSTD_frameParameters - zstd frame parameters + * @contentSizeFlag: Controls whether content size will be present in the frame + * header (when known). + * @checksumFlag: Controls whether a 32-bit checksum is generated at the end + * of the frame for error detection. + * @noDictIDFlag: Controls whether dictID will be saved into the frame header + * when using dictionary compression. + * + * The default value is all fields set to 0. + */ +typedef struct { + unsigned int contentSizeFlag; + unsigned int checksumFlag; + unsigned int noDictIDFlag; +} ZSTD_frameParameters; + +/** + * struct ZSTD_parameters - zstd parameters + * @cParams: The compression parameters. + * @fParams: The frame parameters. + */ +typedef struct { + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; +} ZSTD_parameters; + +/** + * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level + * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). + * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. + * @dictSize: The dictionary size or 0 if a dictionary isn't being used. + * + * Return: The selected ZSTD_compressionParameters. + */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, + unsigned long long estimatedSrcSize, size_t dictSize); + +/** + * ZSTD_getParams() - returns ZSTD_parameters for selected level + * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). + * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. + * @dictSize: The dictionary size or 0 if a dictionary isn't being used. + * + * The same as ZSTD_getCParams() except also selects the default frame + * parameters (all zero). 
+ * + * Return: The selected ZSTD_parameters. + */ +ZSTD_parameters ZSTD_getParams(int compressionLevel, + unsigned long long estimatedSrcSize, size_t dictSize); + +/*-************************************* + * Explicit memory management + **************************************/ + +/** + * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx + * @cParams: The compression parameters to be used for compression. + * + * If multiple compression parameters might be used, the caller must call + * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum + * size. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCCtx(). + */ +size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CCtx - the zstd compression context + * + * When compressing many times it is recommended to allocate a context just once + * and reuse it for each successive compression operation. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +/** + * ZSTD_initCCtx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Use ZSTD_CCtxWorkspaceBound() to + * determine how large the workspace must be. + * + * Return: A compression context emplaced into workspace. + */ +ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize); + +/** + * ZSTD_compressCCtx() - compress src into dst + * @ctx: The context. Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @params: The parameters to use for compression. See ZSTD_getParams(). 
+ * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, ZSTD_parameters params); + +/** + * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDCtx(). + */ +size_t ZSTD_DCtxWorkspaceBound(void); + +/** + * struct ZSTD_DCtx - the zstd decompression context + * + * When decompressing many times it is recommended to allocate a context just + * once and reuse it for each successive decompression operation. + */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +/** + * ZSTD_initDCtx() - initialize a zstd decompression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to + * determine how large the workspace must be. + * + * Return: A decompression context emplaced into workspace. + */ +ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize); + +/** + * ZSTD_decompressDCtx() - decompress zstd compressed src into dst + * @ctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). 
+ */ +size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); + +/*-************************ + * Simple dictionary API + **************************/ + +/** + * ZSTD_compress_usingDict() - compress src into dst using a dictionary + * @ctx: The context. Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @dict: The dictionary to use for compression. + * @dictSize: The size of the dictionary. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * + * Compression using a predefined dictionary. The same dictionary must be used + * during decompression. + * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, const void *dict, size_t dictSize, + ZSTD_parameters params); + +/** + * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary + * @ctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * @dict: The dictionary to use for decompression. The same dictionary + * must've been used to compress the data. + * @dictSize: The size of the dictionary. 
+ * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, const void *dict, size_t dictSize); + +/*-************************** + * Fast dictionary API + ***************************/ + +/** + * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict + * @cParams: The compression parameters to be used for compression. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCDict(). + */ +size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CDict - a digested dictionary to be used for compression + */ +typedef struct ZSTD_CDict_s ZSTD_CDict; + +/** + * ZSTD_initCDict() - initialize a digested dictionary for compression + * @dictBuffer: The dictionary to digest. The buffer is referenced by the + * ZSTD_CDict so it must outlive the returned ZSTD_CDict. + * @dictSize: The size of the dictionary. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * @workspace: The workspace. It must outlive the returned ZSTD_CDict. + * @workspaceSize: The workspace size. Must be at least + * ZSTD_CDictWorkspaceBound(params.cParams). + * + * When compressing multiple messages / blocks with the same dictionary it is + * recommended to load it just once. The ZSTD_CDict merely references the + * dictBuffer, so it must outlive the returned ZSTD_CDict. + * + * Return: The digested dictionary emplaced into workspace. + */ +ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize, + ZSTD_parameters params, void *workspace, size_t workspaceSize); + +/** + * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict + * @cctx: The context. 
Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(cParams) where + * cParams are the compression parameters used to initialize the + * cdict. + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @cdict: The digested dictionary to use for compression. The compression + * parameters are the ones the cdict was initialized with. + * + * Compression using a digested dictionary. The same dictionary must be used + * during decompression. + * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, const ZSTD_CDict *cdict); + + +/** + * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDDict(). + */ +size_t ZSTD_DDictWorkspaceBound(void); + +/** + * struct ZSTD_DDict - a digested dictionary to be used for decompression + */ +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/** + * ZSTD_initDDict() - initialize a digested dictionary for decompression + * @dictBuffer: The dictionary to digest. The buffer is referenced by the + * ZSTD_DDict so it must outlive the returned ZSTD_DDict. + * @dictSize: The size of the dictionary. + * @workspace: The workspace. It must outlive the returned ZSTD_DDict. + * @workspaceSize: The workspace size. Must be at least + * ZSTD_DDictWorkspaceBound(). + * + * When decompressing multiple messages / blocks with the same dictionary it is + * recommended to load it just once. The ZSTD_DDict merely references the + * dictBuffer, so it must outlive the returned ZSTD_DDict. + * + * Return: The digested dictionary emplaced into workspace. 
+ */ +ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize, + void *workspace, size_t workspaceSize); + +/** + * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict + * @dctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * @ddict: The digested dictionary to use for decompression. The same + * dictionary must've been used to compress the data. + * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, + size_t dstCapacity, const void *src, size_t srcSize, + const ZSTD_DDict *ddict); + + +/*-************************** + * Streaming + ***************************/ + +/** + * struct ZSTD_inBuffer - input buffer for streaming + * @src: Start of the input buffer. + * @size: Size of the input buffer. + * @pos: Position where reading stopped. Will be updated. + * Necessarily 0 <= pos <= size. + */ +typedef struct ZSTD_inBuffer_s { + const void *src; + size_t size; + size_t pos; +} ZSTD_inBuffer; + +/** + * struct ZSTD_outBuffer - output buffer for streaming + * @dst: Start of the output buffer. + * @size: Size of the output buffer. + * @pos: Position where writing stopped. Will be updated. + * Necessarily 0 <= pos <= size. + */ +typedef struct ZSTD_outBuffer_s { + void *dst; + size_t size; + size_t pos; +} ZSTD_outBuffer; + + + +/*-***************************************************************************** + * Streaming compression - HowTo + * + * A ZSTD_CStream object is required to track streaming operations. 
+ * Use ZSTD_initCStream() to initialize a ZSTD_CStream object. + * ZSTD_CStream objects can be reused multiple times on consecutive compression + * operations. It is recommended to re-use ZSTD_CStream in situations where many + * streaming operations will be achieved consecutively. Use one separate + * ZSTD_CStream per thread for parallel execution. + * + * Use ZSTD_compressStream() repetitively to consume input stream. + * The function will automatically update both `pos` fields. + * Note that it may not consume the entire input, in which case `pos < size`, + * and it's up to the caller to present again remaining data. + * It returns a hint for the preferred number of bytes to use as an input for + * the next function call. + * + * At any moment, it's possible to flush whatever data remains within internal + * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might + * still be some content left within the internal buffer if `output->size` is + * too small. It returns the number of bytes left in the internal buffer and + * must be called until it returns 0. + * + * ZSTD_endStream() instructs to finish a frame. It will perform a flush and + * write frame epilogue. The epilogue is required for decoders to consider a + * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush + * the full content if `output->size` is too small. In which case, call again + * ZSTD_endStream() to complete the flush. It returns the number of bytes left + * in the internal buffer and must be called until it returns 0. + ******************************************************************************/ + +/** + * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream + * @cParams: The compression parameters to be used for compression. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCStream() and ZSTD_initCStream_usingCDict(). 
+ */ +size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CStream - the zstd streaming compression context + */ +typedef struct ZSTD_CStream_s ZSTD_CStream; + +/*===== ZSTD_CStream management functions =====*/ +/** + * ZSTD_initCStream() - initialize a zstd streaming compression context + * @params: The zstd compression parameters. + * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must + * pass the source size (zero means empty source). Otherwise, + * the caller may optionally pass the source size, or zero if + * unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine + * how large the workspace must be. + * + * Return: The zstd streaming compression context. + */ +ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, + unsigned long long pledgedSrcSize, void *workspace, + size_t workspaceSize); + +/** + * ZSTD_initCStream_usingCDict() - initialize a streaming compression context + * @cdict: The digested dictionary to use for compression. + * @pledgedSrcSize: Optionally the source size, or zero if unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Call ZSTD_CStreamWorkspaceBound() + * with the cParams used to initialize the cdict to determine + * how large the workspace must be. + * + * Return: The zstd streaming compression context. + */ +ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, + unsigned long long pledgedSrcSize, void *workspace, + size_t workspaceSize); + +/*===== Streaming compression functions =====*/ +/** + * ZSTD_resetCStream() - reset the context using parameters from creation + * @zcs: The zstd streaming compression context to reset. + * @pledgedSrcSize: Optionally the source size, or zero if unknown. 
+ * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame + * content size is always written into the frame header. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize); +/** + * ZSTD_compressStream() - streaming compress some of input into output + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * @input: Source buffer. `input->pos` is updated to indicate how much data was + * read. Note that it may not consume the entire input, in which case + * `input->pos < input->size`, and it's up to the caller to present + * remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * + * Return: A hint for the number of bytes to use as the input for the next + * function call or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output, + ZSTD_inBuffer *input); +/** + * ZSTD_flushStream() - flush internal buffers into output + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * ZSTD_flushStream() must be called until it returns 0, meaning all the data + * has been flushed. Since ZSTD_flushStream() causes a block to be ended, + * calling it too often will degrade the compression ratio. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using ZSTD_isError(). 
+ */ +size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); +/** + * ZSTD_endStream() - flush internal buffers into output and end the frame + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * ZSTD_endStream() must be called until it returns 0, meaning all the data has + * been flushed and the frame epilogue has been written. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); + +/** + * ZSTD_CStreamInSize() - recommended size for the input buffer + * + * Return: The recommended size for the input buffer. + */ +size_t ZSTD_CStreamInSize(void); +/** + * ZSTD_CStreamOutSize() - recommended size for the output buffer + * + * When the output buffer is at least this large, it is guaranteed to be large + * enough to flush at least one complete compressed block. + * + * Return: The recommended size for the output buffer. + */ +size_t ZSTD_CStreamOutSize(void); + + + +/*-***************************************************************************** + * Streaming decompression - HowTo + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_initDStream() to initialize a ZSTD_DStream object. + * ZSTD_DStream objects can be re-used multiple times. + * + * Use ZSTD_decompressStream() repetitively to consume your input. + * The function will update both `pos` fields. + * If `input->pos < input->size`, some input has not been consumed. + * It's up to the caller to present again remaining data. + * If `output->pos < output->size`, decoder has flushed everything it could. + * Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise it returns a suggested next input size that will never load more + * than the current frame. 
+ ******************************************************************************/ + +/** + * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream + * @maxWindowSize: The maximum window size allowed for compressed frames. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDStream() and ZSTD_initDStream_usingDDict(). + */ +size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize); + +/** + * struct ZSTD_DStream - the zstd streaming decompression context + */ +typedef struct ZSTD_DStream_s ZSTD_DStream; +/*===== ZSTD_DStream management functions =====*/ +/** + * ZSTD_initDStream() - initialize a zstd streaming decompression context + * @maxWindowSize: The maximum window size allowed for compressed frames. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine + * how large the workspace must be. + * + * Return: The zstd streaming decompression context. + */ +ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, + size_t workspaceSize); +/** + * ZSTD_initDStream_usingDDict() - initialize streaming decompression context + * @maxWindowSize: The maximum window size allowed for compressed frames. + * @ddict: The digested dictionary to use for decompression. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine + * how large the workspace must be. + * + * Return: The zstd streaming decompression context. 
+ */ +ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize, + const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize); + +/*===== Streaming decompression functions =====*/ +/** + * ZSTD_resetDStream() - reset the context using parameters from creation + * @zds: The zstd streaming decompression context to reset. + * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_resetDStream(ZSTD_DStream *zds); +/** + * ZSTD_decompressStream() - streaming decompress some of input into output + * @zds: The zstd streaming decompression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * decompressed data was written. + * @input: Source buffer. `input->pos` is updated to indicate how much data was + * read. Note that it may not consume the entire input, in which case + * `input->pos < input->size`, and it's up to the caller to present + * remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * ZSTD_decompressStream() will not consume the last byte of the frame until + * the entire frame is flushed. + * + * Return: Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise returns a hint for the number of bytes to use as the input + * for the next function call or an error, which can be checked using + * ZSTD_isError(). The size hint will never load more than the frame. + */ +size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, + ZSTD_inBuffer *input); + +/** + * ZSTD_DStreamInSize() - recommended size for the input buffer + * + * Return: The recommended size for the input buffer. 
+ */ +size_t ZSTD_DStreamInSize(void); +/** + * ZSTD_DStreamOutSize() - recommended size for the output buffer + * + * When the output buffer is at least this large, it is guaranteed to be large + * enough to flush at least one complete decompressed block. + * + * Return: The recommended size for the output buffer. + */ +size_t ZSTD_DStreamOutSize(void); + + +/* --- Constants ---*/ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U + +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + +#define ZSTD_WINDOWLOG_MAX_32 27 +#define ZSTD_WINDOWLOG_MAX_64 27 +#define ZSTD_WINDOWLOG_MAX \ + ((unsigned int)(sizeof(size_t) == 4 \ + ? ZSTD_WINDOWLOG_MAX_32 \ + : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX (ZSTD_WINDOWLOG_MAX+1) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_HASHLOG3_MAX 17 +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +/* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_SEARCHLENGTH_MAX 7 +/* only for ZSTD_btopt, other strategies are limited to 4 */ +#define ZSTD_SEARCHLENGTH_MIN 3 +#define ZSTD_TARGETLENGTH_MIN 4 +#define ZSTD_TARGETLENGTH_MAX 999 + +/* for static allocation */ +#define ZSTD_FRAMEHEADERSIZE_MAX 18 +#define ZSTD_FRAMEHEADERSIZE_MIN 6 +static const size_t ZSTD_frameHeaderSize_prefix = 5; +static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN; +static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX; +/* magic number + skippable frame length */ +static const size_t ZSTD_skippableHeaderSize = 8; + + +/*-************************************* + * Compressed size functions + **************************************/ + +/** + * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame + * @src: Source buffer. 
It should point to the start of a zstd encoded frame + * or a skippable frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * size of the frame. + * + * Return: The compressed size of the frame pointed to by `src` or an error, + * which can be checked with ZSTD_isError(). + * Suitable to pass to ZSTD_decompress() or similar functions. + */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize); + +/*-************************************* + * Decompressed size functions + **************************************/ +/** + * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header + * @src: It should point to the start of a zstd encoded frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * frame header. `ZSTD_frameHeaderSize_max` is always large enough. + * + * Return: The frame content size stored in the frame header if known. + * `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the + * frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input. + */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/** + * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames + * @src: It should point to the start of a series of zstd encoded and/or + * skippable frames. + * @srcSize: The exact size of the series of frames. + * + * If any zstd encoded frame in the series doesn't have the frame content size + * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always + * set when using ZSTD_compress(). The decompressed size can be very large. + * If the source is untrusted, the decompressed size could be wrong or + * intentionally modified. Always ensure the result fits within the + * application's authorized limits. ZSTD_findDecompressedSize() handles multiple + * frames, and so it must traverse the input to read each frame header. 
This is + * efficient as most of the data is skipped, however it does mean that all frame + * data must be present and valid. + * + * Return: Decompressed size of all the data contained in the frames if known. + * `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown. + * `ZSTD_CONTENTSIZE_ERROR` if an error occurred. + */ +unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize); + +/*-************************************* + * Advanced compression functions + **************************************/ +/** + * ZSTD_checkCParams() - ensure parameter values remain within authorized range + * @cParams: The zstd compression parameters. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams); + +/** + * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize + * @srcSize: Optionally the estimated source size, or zero if unknown. + * @dictSize: Optionally the estimated dictionary size, or zero if unknown. + * + * Return: The optimized parameters. + */ +ZSTD_compressionParameters ZSTD_adjustCParams( + ZSTD_compressionParameters cParams, unsigned long long srcSize, + size_t dictSize); + +/*--- Advanced decompression functions ---*/ + +/** + * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame + * @buffer: The source buffer to check. + * @size: The size of the source buffer, must be at least 4 bytes. + * + * Return: True iff the buffer starts with a zstd or skippable frame identifier. + */ +unsigned int ZSTD_isFrame(const void *buffer, size_t size); + +/** + * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary + * @dict: The dictionary buffer. + * @dictSize: The size of the dictionary buffer. + * + * Return: The dictionary id stored within the dictionary or 0 if the + * dictionary is not a zstd dictionary. If it returns 0 the + * dictionary can still be loaded as a content-only dictionary. 
+ */ +unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize); + +/** + * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict + * @ddict: The ddict to find the id of. + * + * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not + * a zstd dictionary. If it returns 0 `ddict` will be loaded as a + * content-only dictionary. + */ +unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict); + +/** + * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame + * @src: Source buffer. It must be a zstd encoded frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * frame header. `ZSTD_frameHeaderSize_max` is always large enough. + * + * Return: The dictionary id required to decompress the frame stored within + * `src` or 0 if the dictionary id could not be decoded. It can return + * 0 if the frame does not require a dictionary, the dictionary id + * wasn't stored in the frame, `src` is not a zstd frame, or `srcSize` + * is too small. + */ +unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize); + +/** + * struct ZSTD_frameParams - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or 0 if not present. + * @windowSize: The window size, or 0 if the frame is a skippable frame. + * @dictID: The dictionary id, or 0 if not present. + * @checksumFlag: Whether a checksum was used. + */ +typedef struct { + unsigned long long frameContentSize; + unsigned int windowSize; + unsigned int dictID; + unsigned int checksumFlag; +} ZSTD_frameParams; + +/** + * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame + * @fparamsPtr: On success the frame parameters are written here. + * @src: The source buffer. It must point to a zstd or skippable frame. + * @srcSize: The size of the source buffer. `ZSTD_frameHeaderSize_max` is + * always large enough to succeed. 
+ * + * Return: 0 on success. If more data is required it returns how many bytes + * must be provided to make forward progress. Otherwise it returns + * an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src, + size_t srcSize); + +/*-***************************************************************************** + * Buffer-less and synchronous inner streaming functions + * + * This is an advanced API, giving full control over buffer management, for + * users which need direct control over memory. + * But it's also a complex one, with many restrictions (documented below). + * Prefer using normal streaming API for an easier experience + ******************************************************************************/ + +/*-***************************************************************************** + * Buffer-less streaming compression (synchronous mode) + * + * A ZSTD_CCtx object is required to track streaming operations. + * Use ZSTD_initCCtx() to initialize a context. + * ZSTD_CCtx object can be re-used multiple times within successive compression + * operations. + * + * Start by initializing a context. + * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary + * compression, + * or ZSTD_compressBegin_advanced(), for finer parameter control. + * It's also possible to duplicate a reference context which has already been + * initialized, using ZSTD_copyCCtx() + * + * Then, consume your input using ZSTD_compressContinue(). + * There are some important considerations to keep in mind when using this + * advanced function : + * - ZSTD_compressContinue() has no internal buffer. It uses externally provided + * buffer only. + * - Interface is synchronous : input is consumed entirely and produce 1+ + * (or more) compressed blocks. + * - Caller must ensure there is enough space in `dst` to store compressed data + * under worst case scenario. 
Worst case evaluation is provided by + * ZSTD_compressBound(). + * ZSTD_compressContinue() doesn't guarantee recovery after a failed + * compression. + * - ZSTD_compressContinue() presumes prior input ***is still accessible and + * unmodified*** (up to maximum distance size, see WindowLog). + * It remembers all previous contiguous blocks, plus one separated memory + * segment (which can itself consist of multiple contiguous blocks) + * - ZSTD_compressContinue() detects that prior input has been overwritten when + * `src` buffer overlaps. In which case, it will "discard" the relevant memory + * section from its history. + * + * Finish a frame with ZSTD_compressEnd(), which will write the last block(s) + * and optional checksum. It's possible to use srcSize==0, in which case, it + * will write a final empty block to end the frame. Without last block mark, + * frames will be considered unfinished (corrupted) by decoders. + * + * `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new + * frame. 
+ ******************************************************************************/ + +/*===== Buffer-less streaming compression functions =====*/ +size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel); +size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict, + size_t dictSize, int compressionLevel); +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict, + size_t dictSize, ZSTD_parameters params, + unsigned long long pledgedSrcSize); +size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx, + unsigned long long pledgedSrcSize); +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict, + unsigned long long pledgedSrcSize); +size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); + + + +/*-***************************************************************************** + * Buffer-less streaming decompression (synchronous mode) + * + * A ZSTD_DCtx object is required to track streaming operations. + * Use ZSTD_initDCtx() to initialize a context. + * A ZSTD_DCtx object can be re-used multiple times. + * + * First typical operation is to retrieve frame parameters, using + * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provides + * important information to correctly decode the frame, such as the minimum + * rolling buffer size to allocate to decompress data (`windowSize`), and the + * dictionary ID used. + * Note: content size is optional, it may not be present. 0 means unknown. + * Note that these values could be wrong, either because of data malformation, + * or because an attacker is deliberately spoofing false information. As a + * consequence, check that values remain within valid application range, + * especially `windowSize`, before allocation. 
Each application can set its own + * limit, depending on local restrictions. For extended interoperability, it is + * recommended to support at least 8 MB. + * Frame parameters are extracted from the beginning of the compressed frame. + * Data fragment must be large enough to ensure successful decoding, typically + * `ZSTD_frameHeaderSize_max` bytes. + * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled. + * >0: `srcSize` is too small, provide at least this many bytes. + * errorCode, which can be tested using ZSTD_isError(). + * + * Start decompression, with ZSTD_decompressBegin() or + * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared + * context, using ZSTD_copyDCtx(). + * + * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() + * alternatively. + * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' + * to ZSTD_decompressContinue(). + * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will + * fail. + * + * The result of ZSTD_decompressContinue() is the number of bytes regenerated + * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an + * error; it just means ZSTD_decompressContinue() has decoded some metadata + * item. It can also be an error code, which can be tested with ZSTD_isError(). + * + * ZSTD_decompressContinue() needs previous data blocks during decompression, up + * to `windowSize`. They should preferably be located contiguously, prior to + * current block. Alternatively, a round buffer of sufficient size is also + * possible. Sufficient size is determined by frame parameters. + * ZSTD_decompressContinue() is very sensitive to contiguity, if 2 blocks don't + * follow each other, make sure that either the compressor breaks contiguity at + * the same place, or that previous contiguous segment is large enough to + * properly handle maximum back-reference. 
+ * + * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + * Context can then be reset to start a new decompression. + * + * Note: it's possible to know if next input to present is a header or a block, + * using ZSTD_nextInputType(). This information is not required to properly + * decode a frame. + * + * == Special case: skippable frames == + * + * Skippable frames allow integration of user-defined data into a flow of + * concatenated frames. Skippable frames will be ignored (skipped) by a + * decompressor. The format of skippable frames is as follows: + * a) Skippable frame ID - 4 Bytes, Little endian format, any value from + * 0x184D2A50 to 0x184D2A5F + * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits + * c) Frame Content - any content (User Data) of length equal to Frame Size + * For skippable frames ZSTD_decompressContinue() always returns 0. + * For skippable frames ZSTD_getFrameParams() returns + * fparamsPtr->windowSize==0, which means that a frame is skippable. + * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might + * actually be a zstd encoded frame with no content. For purposes of + * decompression, it is valid in both cases to skip the frame using + * ZSTD_findFrameCompressedSize() to find its size in bytes. + * It also returns frame size as fparamsPtr->frameContentSize. 
+ ******************************************************************************/ + +/*===== Buffer-less streaming decompression functions =====*/ +size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx); +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict, + size_t dictSize); +void ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx); +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx); +size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +typedef enum { + ZSTDnit_frameHeader, + ZSTDnit_blockHeader, + ZSTDnit_block, + ZSTDnit_lastBlock, + ZSTDnit_checksum, + ZSTDnit_skippableFrame +} ZSTD_nextInputType_e; +ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx); + +/*-***************************************************************************** + * Block functions + * + * Block functions produce and decode raw zstd blocks, without frame metadata. + * Frame metadata cost is typically ~18 bytes, which can be non-negligible for + * very small blocks (< 100 bytes). User will have to take in charge required + * information to regenerate data, such as compressed and content sizes. + * + * A few rules to respect: + * - Compressing and decompressing require a context structure + * + Use ZSTD_initCCtx() and ZSTD_initDCtx() + * - It is necessary to init context before starting + * + compression : ZSTD_compressBegin() + * + decompression : ZSTD_decompressBegin() + * + variants _usingDict() are also allowed + * + copyCCtx() and copyDCtx() work too + * - Block size is limited, it must be <= ZSTD_getBlockSizeMax() + * + If you need to compress more, cut data into multiple blocks + * + Consider using the regular ZSTD_compress() instead, as frame metadata + * costs become negligible when source size is large. + * - When a block is considered not compressible enough, ZSTD_compressBlock() + * result will be zero. In which case, nothing is produced into `dst`. 
+ * + User must test for such outcome and deal directly with uncompressed data + * + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!! + * + In case of multiple successive blocks, decoder must be informed of + * uncompressed block existence to follow proper history. Use + * ZSTD_insertBlock() in such a case. + ******************************************************************************/ + +/* Define for static allocation */ +#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024) +/*===== Raw zstd block functions =====*/ +size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx); +size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, + size_t blockSize); + +#endif /* ZSTD_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 45f6b279d4a0..e1f158d7cd78 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -242,6 +242,14 @@ config LZ4HC_COMPRESS config LZ4_DECOMPRESS tristate +config ZSTD_COMPRESS + select XXHASH + tristate + +config ZSTD_DECOMPRESS + select XXHASH + tristate + source "lib/xz/Kconfig" # diff --git a/lib/Makefile b/lib/Makefile index eb5af4a37c7f..5b6027433688 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -107,6 +107,8 @@ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ obj-$(CONFIG_LZ4_COMPRESS) += lz4/ obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ +obj-$(CONFIG_ZSTD_COMPRESS) += zstd/ +obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile new file mode 100644 index 000000000000..dd0a359c135b --- /dev/null +++ b/lib/zstd/Makefile @@ -0,0 +1,18 @@ +obj-$(CONFIG_ZSTD_COMPRESS) += zstd_compress.o +obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd_decompress.o + +ccflags-y += -O3 + +# Object files unique to zstd_compress and 
zstd_decompress +zstd_compress-y := fse_compress.o huf_compress.o compress.o +zstd_decompress-y := huf_decompress.o decompress.o + +# These object files are shared between the modules. +# Always add them to zstd_compress. +# Unless both zstd_compress and zstd_decompress are built in +# then also add them to zstd_decompress. +zstd_compress-y += entropy_common.o fse_decompress.o zstd_common.o + +ifneq ($(CONFIG_ZSTD_COMPRESS)$(CONFIG_ZSTD_DECOMPRESS),yy) + zstd_decompress-y += entropy_common.o fse_decompress.o zstd_common.o +endif diff --git a/lib/zstd/bitstream.h b/lib/zstd/bitstream.h new file mode 100644 index 000000000000..a826b99e1d63 --- /dev/null +++ b/lib/zstd/bitstream.h @@ -0,0 +1,374 @@ +/* + * bitstream + * Part of FSE library + * header file (to include) + * Copyright (C) 2013-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + */ +#ifndef BITSTREAM_H_MODULE +#define BITSTREAM_H_MODULE + +/* +* This API consists of small unitary functions, which must be inlined for best performance. +* Since link-time-optimization is not available for all compilers, +* these functions are defined into a .h to be included. +*/ + +/*-**************************************** +* Dependencies +******************************************/ +#include "error_private.h" /* error codes and messages */ +#include "mem.h" /* unaligned access routines */ + +/*========================================= +* Target specific +=========================================*/ +#define STREAM_ACCUMULATOR_MIN_32 25 +#define STREAM_ACCUMULATOR_MIN_64 57 +#define STREAM_ACCUMULATOR_MIN ((U32)(ZSTD_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64)) + +/*-****************************************** +* bitStream encoding API (write forward) +********************************************/ +/* bitStream can mix input from multiple sources. 
+* A critical property of these streams is that they encode and decode in **reverse** direction. +* So the first bit sequence you add will be the last to be read, like a LIFO stack. +*/ +typedef struct { + size_t bitContainer; + int bitPos; + char *startPtr; + char *ptr; + char *endPtr; +} BIT_CStream_t; + +ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *dstBuffer, size_t dstCapacity); +ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits); +ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC); +ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC); + +/* Start with initCStream, providing the size of buffer to write into. +* bitStream will never write outside of this buffer. +* `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. +* +* bits are first added to a local register. +* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. +* Writing data into memory is an explicit operation, performed by the flushBits function. +* Hence keep track how many bits are potentially stored into local register to avoid register overflow. +* After a flushBits, a maximum of 7 bits might still be stored into local register. +* +* Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers. +* +* Last operation is to close the bitStream. +* The function returns the final size of CStream in bytes. 
+* If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable) +*/ + +/*-******************************************** +* bitStream decoding API (read backward) +**********************************************/ +typedef struct { + size_t bitContainer; + unsigned bitsConsumed; + const char *ptr; + const char *start; +} BIT_DStream_t; + +typedef enum { + BIT_DStream_unfinished = 0, + BIT_DStream_endOfBuffer = 1, + BIT_DStream_completed = 2, + BIT_DStream_overflow = 3 +} BIT_DStream_status; /* result of BIT_reloadDStream() */ +/* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ + +ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize); +ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, unsigned nbBits); +ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD); +ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *bitD); + +/* Start by invoking BIT_initDStream(). +* A chunk of the bitStream is then stored into a local register. +* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +* You can then retrieve bitFields stored into the local register, **in reverse order**. +* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. +* A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +* Otherwise, it can be less than that, so proceed accordingly. +* Checking if DStream has reached its end can be performed with BIT_endOfDStream(). 
+*/ + +/*-**************************************** +* unsafe API +******************************************/ +ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits); +/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + +ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC); +/* unsafe version; does not check buffer overflow */ + +ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, unsigned nbBits); +/* faster, but works only if nbBits >= 1 */ + +/*-************************************************************** +* Internal functions +****************************************************************/ +ZSTD_STATIC unsigned BIT_highbit32(register U32 val) { return 31 - __builtin_clz(val); } + +/*===== Local Constants =====*/ +static const unsigned BIT_mask[] = {0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, + 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, + 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF}; /* up to 26 bits */ + +/*-************************************************************** +* bitStream encoding +****************************************************************/ +/*! BIT_initCStream() : + * `dstCapacity` must be > sizeof(void*) + * @return : 0 if success, + otherwise an error code (can be tested using ERR_isError() ) */ +ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *startPtr, size_t dstCapacity) +{ + bitC->bitContainer = 0; + bitC->bitPos = 0; + bitC->startPtr = (char *)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->ptr); + if (dstCapacity <= sizeof(bitC->ptr)) + return ERROR(dstSize_tooSmall); + return 0; +} + +/*! BIT_addBits() : + can add up to 26 bits into `bitC`. + Does not check for register overflow ! 
*/ +ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits) +{ + bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_addBitsFast() : + * works only if `value` is _clean_, meaning all high bits above nbBits are 0 */ +ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits) +{ + bitC->bitContainer |= value << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_flushBitsFast() : + * unsafe version; does not check buffer overflow */ +ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + ZSTD_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */ +} + +/*! BIT_flushBits() : + * safe version; check for buffer overflow, and prevents it. + * note : does not signal buffer overflow. This will be revealed later on using BIT_closeCStream() */ +ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + ZSTD_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + if (bitC->ptr > bitC->endPtr) + bitC->ptr = bitC->endPtr; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */ +} + +/*! BIT_closeCStream() : + * @return : size of CStream, in bytes, + or 0 if it could not fit into dstBuffer */ +ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC) +{ + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + + if (bitC->ptr >= bitC->endPtr) + return 0; /* doesn't fit within authorized budget : cancel */ + + return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); +} + +/*-******************************************************** +* bitStream decoding +**********************************************************/ +/*! 
BIT_initDStream() : +* Initialize a BIT_DStream_t. +* `bitD` : a pointer to an already allocated BIT_DStream_t structure. +* `srcSize` must be the *exact* size of the bitStream, in bytes. +* @return : size of stream (== srcSize) or an errorCode if a problem is detected +*/ +ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize) +{ + if (srcSize < 1) { + memset(bitD, 0, sizeof(*bitD)); + return ERROR(srcSize_wrong); + } + + if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */ + bitD->start = (const char *)srcBuffer; + bitD->ptr = (const char *)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = ZSTD_readLEST(bitD->ptr); + { + BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1]; + bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) + return ERROR(GENERIC); /* endMark not present */ + } + } else { + bitD->start = (const char *)srcBuffer; + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE *)(bitD->start); + switch (srcSize) { + case 7: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[6]) << (sizeof(bitD->bitContainer) * 8 - 16); + case 6: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[5]) << (sizeof(bitD->bitContainer) * 8 - 24); + case 5: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[4]) << (sizeof(bitD->bitContainer) * 8 - 32); + case 4: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[3]) << 24; + case 3: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[2]) << 16; + case 2: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[1]) << 8; + default:; + } + { + BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1]; + bitD->bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; + if (lastByte == 0) + return ERROR(GENERIC); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize) * 8; + } + + return srcSize; +} + +ZSTD_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) { return bitContainer >> start; } + +ZSTD_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) { return (bitContainer >> start) & BIT_mask[nbBits]; } + +ZSTD_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) { return bitContainer & BIT_mask[nbBits]; } + +/*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. + * @return : value extracted + */ +ZSTD_STATIC size_t BIT_lookBits(const BIT_DStream_t *bitD, U32 nbBits) +{ + U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1; + return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask - nbBits) & bitMask); +} + +/*! BIT_lookBitsFast() : +* unsafe version; only works only if nbBits >= 1 */ +ZSTD_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t *bitD, U32 nbBits) +{ + U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1; + return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask + 1) - nbBits) & bitMask); +} + +ZSTD_STATIC void BIT_skipBits(BIT_DStream_t *bitD, U32 nbBits) { bitD->bitsConsumed += nbBits; } + +/*! BIT_readBits() : + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. + */ +ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, U32 nbBits) +{ + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! 
BIT_readBitsFast() : +* unsafe version; only works only if nbBits >= 1 */ +ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, U32 nbBits) +{ + size_t const value = BIT_lookBitsFast(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! BIT_reloadDStream() : +* Refill `bitD` from buffer previously set in BIT_initDStream() . +* This function is safe, it guarantees it will not read beyond src buffer. +* @return : status of `BIT_DStream_t` internal register. + if status == BIT_DStream_unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */ +ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD) +{ + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer) * 8)) /* should not happen => corruption detected */ + return BIT_DStream_overflow; + + if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) { + bitD->ptr -= bitD->bitsConsumed >> 3; + bitD->bitsConsumed &= 7; + bitD->bitContainer = ZSTD_readLEST(bitD->ptr); + return BIT_DStream_unfinished; + } + if (bitD->ptr == bitD->start) { + if (bitD->bitsConsumed < sizeof(bitD->bitContainer) * 8) + return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } + { + U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { + nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */ + result = BIT_DStream_endOfBuffer; + } + bitD->ptr -= nbBytes; + bitD->bitsConsumed -= nbBytes * 8; + bitD->bitContainer = ZSTD_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD) */ + return result; + } +} + +/*! BIT_endOfDStream() : +* @return Tells if DStream has exactly reached its end (all bits consumed). 
+*/ +ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *DStream) +{ + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer) * 8)); +} + +#endif /* BITSTREAM_H_MODULE */ diff --git a/lib/zstd/compress.c b/lib/zstd/compress.c new file mode 100644 index 000000000000..f9166cf4f7a9 --- /dev/null +++ b/lib/zstd/compress.c @@ -0,0 +1,3484 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ */ + +/*-************************************* +* Dependencies +***************************************/ +#include "fse.h" +#include "huf.h" +#include "mem.h" +#include "zstd_internal.h" /* includes zstd.h */ +#include +#include +#include /* memset */ + +/*-************************************* +* Constants +***************************************/ +static const U32 g_searchStrength = 8; /* control skip over incompressible data */ +#define HASH_READ_SIZE 8 +typedef enum { ZSTDcs_created = 0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e; + +/*-************************************* +* Helper functions +***************************************/ +size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; } + +/*-************************************* +* Sequence storage +***************************************/ +static void ZSTD_resetSeqStore(seqStore_t *ssPtr) +{ + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthID = 0; +} + +/*-************************************* +* Context memory management +***************************************/ +struct ZSTD_CCtx_s { + const BYTE *nextSrc; /* next block here to continue on curr prefix */ + const BYTE *base; /* All regular indexes relative to this position */ + const BYTE *dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more data */ + U32 nextToUpdate; /* index from which to continue dictionary update */ + U32 nextToUpdate3; /* index from which to continue dictionary update */ + U32 hashLog3; /* dispatch table : larger == faster, more memory */ + U32 loadedDictEnd; /* index of end of dictionary */ + U32 forceWindow; /* force back-references to respect limit of 1< 3) ? 
0 : MIN(ZSTD_HASHLOG3_MAX, cParams.windowLog); + size_t const h3Size = ((size_t)1) << hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + size_t const optSpace = + ((MaxML + 1) + (MaxLL + 1) + (MaxOff + 1) + (1 << Litbits)) * sizeof(U32) + (ZSTD_OPT_NUM + 1) * (sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t)); + size_t const workspaceSize = tableSpace + (256 * sizeof(U32)) /* huffTable */ + tokenSpace + + (((cParams.strategy == ZSTD_btopt) || (cParams.strategy == ZSTD_btopt2)) ? optSpace : 0); + + return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_CCtx)) + ZSTD_ALIGN(workspaceSize); +} + +static ZSTD_CCtx *ZSTD_createCCtx_advanced(ZSTD_customMem customMem) +{ + ZSTD_CCtx *cctx; + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + cctx = (ZSTD_CCtx *)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem); + if (!cctx) + return NULL; + memset(cctx, 0, sizeof(ZSTD_CCtx)); + cctx->customMem = customMem; + return cctx; +} + +ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + ZSTD_CCtx *cctx = ZSTD_createCCtx_advanced(stackMem); + if (cctx) { + cctx->workSpace = ZSTD_stackAllocAll(cctx->customMem.opaque, &cctx->workSpaceSize); + } + return cctx; +} + +size_t ZSTD_freeCCtx(ZSTD_CCtx *cctx) +{ + if (cctx == NULL) + return 0; /* support free on NULL */ + ZSTD_free(cctx->workSpace, cctx->customMem); + ZSTD_free(cctx, cctx->customMem); + return 0; /* reserved as a potential error code in the future */ +} + +const seqStore_t *ZSTD_getSeqStore(const ZSTD_CCtx *ctx) /* hidden interface */ { return &(ctx->seqStore); } + +static ZSTD_parameters ZSTD_getParamsFromCCtx(const ZSTD_CCtx *cctx) { return cctx->params; } + +/** ZSTD_checkParams() : + ensure param values remain within authorized range. 
+ @return : 0, or an error code if one value is beyond authorized range */ +size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) +{ +#define CLAMPCHECK(val, min, max) \ + { \ + if ((val < min) | (val > max)) \ + return ERROR(compressionParameter_unsupported); \ + } + CLAMPCHECK(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX); + CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX); + CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX); + CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX); + CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX); + CLAMPCHECK(cParams.targetLength, ZSTD_TARGETLENGTH_MIN, ZSTD_TARGETLENGTH_MAX); + if ((U32)(cParams.strategy) > (U32)ZSTD_btopt2) + return ERROR(compressionParameter_unsupported); + return 0; +} + +/** ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) +{ + U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2); + return hashLog - btScale; +} + +/** ZSTD_adjustCParams() : + optimize `cPar` for a given input (`srcSize` and `dictSize`). + mostly downsizing to reduce memory consumption and initialization. + Both `srcSize` and `dictSize` are optional (use 0 if unknown), + but if both are 0, no optimization can be done. + Note : cPar is considered validated at this stage. Use ZSTD_checkParams() to ensure that. */ +ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize) +{ + if (srcSize + dictSize == 0) + return cPar; /* no size information available : no adjustment */ + + /* resize params, to use less memory when necessary */ + { + U32 const minSrcSize = (srcSize == 0) ? 
500 : 0; + U64 const rSize = srcSize + dictSize + minSrcSize; + if (rSize < ((U64)1 << ZSTD_WINDOWLOG_MAX)) { + U32 const srcLog = MAX(ZSTD_HASHLOG_MIN, ZSTD_highbit32((U32)(rSize)-1) + 1); + if (cPar.windowLog > srcLog) + cPar.windowLog = srcLog; + } + } + if (cPar.hashLog > cPar.windowLog) + cPar.hashLog = cPar.windowLog; + { + U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cycleLog > cPar.windowLog) + cPar.chainLog -= (cycleLog - cPar.windowLog); + } + + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* required for frame header */ + + return cPar; +} + +static U32 ZSTD_equivalentParams(ZSTD_parameters param1, ZSTD_parameters param2) +{ + return (param1.cParams.hashLog == param2.cParams.hashLog) & (param1.cParams.chainLog == param2.cParams.chainLog) & + (param1.cParams.strategy == param2.cParams.strategy) & ((param1.cParams.searchLength == 3) == (param2.cParams.searchLength == 3)); +} + +/*! ZSTD_continueCCtx() : + reuse CCtx without reset (note : requires no dictionary) */ +static size_t ZSTD_continueCCtx(ZSTD_CCtx *cctx, ZSTD_parameters params, U64 frameContentSize) +{ + U32 const end = (U32)(cctx->nextSrc - cctx->base); + cctx->params = params; + cctx->frameContentSize = frameContentSize; + cctx->lowLimit = end; + cctx->dictLimit = end; + cctx->nextToUpdate = end + 1; + cctx->stage = ZSTDcs_init; + cctx->dictID = 0; + cctx->loadedDictEnd = 0; + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + cctx->rep[i] = repStartValue[i]; + } + cctx->seqStore.litLengthSum = 0; /* force reset of btopt stats */ + xxh64_reset(&cctx->xxhState, 0); + return 0; +} + +typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset, ZSTDcrp_fullReset } ZSTD_compResetPolicy_e; + +/*! 
ZSTD_resetCCtx_advanced() : + note : `params` must be validated */ +static size_t ZSTD_resetCCtx_advanced(ZSTD_CCtx *zc, ZSTD_parameters params, U64 frameContentSize, ZSTD_compResetPolicy_e const crp) +{ + if (crp == ZSTDcrp_continue) + if (ZSTD_equivalentParams(params, zc->params)) { + zc->flagStaticTables = 0; + zc->flagStaticHufTable = HUF_repeat_none; + return ZSTD_continueCCtx(zc, params, frameContentSize); + } + + { + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog); + U32 const divider = (params.cParams.searchLength == 3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = blockSize + 11 * maxNbSeq; + size_t const chainSize = (params.cParams.strategy == ZSTD_fast) ? 0 : (1 << params.cParams.chainLog); + size_t const hSize = ((size_t)1) << params.cParams.hashLog; + U32 const hashLog3 = (params.cParams.searchLength > 3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, params.cParams.windowLog); + size_t const h3Size = ((size_t)1) << hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + void *ptr; + + /* Check if workSpace is large enough, alloc a new one if needed */ + { + size_t const optSpace = ((MaxML + 1) + (MaxLL + 1) + (MaxOff + 1) + (1 << Litbits)) * sizeof(U32) + + (ZSTD_OPT_NUM + 1) * (sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t)); + size_t const neededSpace = tableSpace + (256 * sizeof(U32)) /* huffTable */ + tokenSpace + + (((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) ? 
optSpace : 0); + if (zc->workSpaceSize < neededSpace) { + ZSTD_free(zc->workSpace, zc->customMem); + zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem); + if (zc->workSpace == NULL) + return ERROR(memory_allocation); + zc->workSpaceSize = neededSpace; + } + } + + if (crp != ZSTDcrp_noMemset) + memset(zc->workSpace, 0, tableSpace); /* reset tables only */ + xxh64_reset(&zc->xxhState, 0); + zc->hashLog3 = hashLog3; + zc->hashTable = (U32 *)(zc->workSpace); + zc->chainTable = zc->hashTable + hSize; + zc->hashTable3 = zc->chainTable + chainSize; + ptr = zc->hashTable3 + h3Size; + zc->hufTable = (HUF_CElt *)ptr; + zc->flagStaticTables = 0; + zc->flagStaticHufTable = HUF_repeat_none; + ptr = ((U32 *)ptr) + 256; /* note : HUF_CElt* is incomplete type, size is simulated using U32 */ + + zc->nextToUpdate = 1; + zc->nextSrc = NULL; + zc->base = NULL; + zc->dictBase = NULL; + zc->dictLimit = 0; + zc->lowLimit = 0; + zc->params = params; + zc->blockSize = blockSize; + zc->frameContentSize = frameContentSize; + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + zc->rep[i] = repStartValue[i]; + } + + if ((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) { + zc->seqStore.litFreq = (U32 *)ptr; + zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1 << Litbits); + zc->seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL + 1); + zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (MaxML + 1); + ptr = zc->seqStore.offCodeFreq + (MaxOff + 1); + zc->seqStore.matchTable = (ZSTD_match_t *)ptr; + ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM + 1; + zc->seqStore.priceTable = (ZSTD_optimal_t *)ptr; + ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM + 1; + zc->seqStore.litLengthSum = 0; + } + zc->seqStore.sequencesStart = (seqDef *)ptr; + ptr = zc->seqStore.sequencesStart + maxNbSeq; + zc->seqStore.llCode = (BYTE *)ptr; + zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq; + zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq; + 
zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq; + + zc->stage = ZSTDcs_init; + zc->dictID = 0; + zc->loadedDictEnd = 0; + + return 0; + } +} + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx *cctx) +{ + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + cctx->rep[i] = 0; +} + +/*! ZSTD_copyCCtx() : +* Duplicate an existing context `srcCCtx` into another one `dstCCtx`. +* Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). +* @return : 0, or an error code */ +size_t ZSTD_copyCCtx(ZSTD_CCtx *dstCCtx, const ZSTD_CCtx *srcCCtx, unsigned long long pledgedSrcSize) +{ + if (srcCCtx->stage != ZSTDcs_init) + return ERROR(stage_wrong); + + memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + { + ZSTD_parameters params = srcCCtx->params; + params.fParams.contentSizeFlag = (pledgedSrcSize > 0); + ZSTD_resetCCtx_advanced(dstCCtx, params, pledgedSrcSize, ZSTDcrp_noMemset); + } + + /* copy tables */ + { + size_t const chainSize = (srcCCtx->params.cParams.strategy == ZSTD_fast) ? 
0 : (1 << srcCCtx->params.cParams.chainLog); + size_t const hSize = ((size_t)1) << srcCCtx->params.cParams.hashLog; + size_t const h3Size = (size_t)1 << srcCCtx->hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + memcpy(dstCCtx->workSpace, srcCCtx->workSpace, tableSpace); + } + + /* copy dictionary offsets */ + dstCCtx->nextToUpdate = srcCCtx->nextToUpdate; + dstCCtx->nextToUpdate3 = srcCCtx->nextToUpdate3; + dstCCtx->nextSrc = srcCCtx->nextSrc; + dstCCtx->base = srcCCtx->base; + dstCCtx->dictBase = srcCCtx->dictBase; + dstCCtx->dictLimit = srcCCtx->dictLimit; + dstCCtx->lowLimit = srcCCtx->lowLimit; + dstCCtx->loadedDictEnd = srcCCtx->loadedDictEnd; + dstCCtx->dictID = srcCCtx->dictID; + + /* copy entropy tables */ + dstCCtx->flagStaticTables = srcCCtx->flagStaticTables; + dstCCtx->flagStaticHufTable = srcCCtx->flagStaticHufTable; + if (srcCCtx->flagStaticTables) { + memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable)); + memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable)); + memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable)); + } + if (srcCCtx->flagStaticHufTable) { + memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256 * 4); + } + + return 0; +} + +/*! ZSTD_reduceTable() : +* reduce table indexes by `reducerValue` */ +static void ZSTD_reduceTable(U32 *const table, U32 const size, U32 const reducerValue) +{ + U32 u; + for (u = 0; u < size; u++) { + if (table[u] < reducerValue) + table[u] = 0; + else + table[u] -= reducerValue; + } +} + +/*! ZSTD_reduceIndex() : +* rescale all indexes to avoid future overflow (indexes are U32) */ +static void ZSTD_reduceIndex(ZSTD_CCtx *zc, const U32 reducerValue) +{ + { + U32 const hSize = 1 << zc->params.cParams.hashLog; + ZSTD_reduceTable(zc->hashTable, hSize, reducerValue); + } + + { + U32 const chainSize = (zc->params.cParams.strategy == ZSTD_fast) ? 
0 : (1 << zc->params.cParams.chainLog); + ZSTD_reduceTable(zc->chainTable, chainSize, reducerValue); + } + + { + U32 const h3Size = (zc->hashLog3) ? 1 << zc->hashLog3 : 0; + ZSTD_reduceTable(zc->hashTable3, h3Size, reducerValue); + } +} + +/*-******************************************************* +* Block entropic compression +*********************************************************/ + +/* See doc/zstd_compression_format.md for detailed format description */ + /* ZSTD_noCompressBlock() : copies `src` unchanged behind a 24-bit little-endian block header built from (srcSize << 2) + bt_raw. @return : ZSTD_blockHeaderSize + srcSize, or dstSize_tooSmall when `dst` cannot hold header and payload */ +size_t ZSTD_noCompressBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + if (srcSize + ZSTD_blockHeaderSize > dstCapacity) + return ERROR(dstSize_tooSmall); + memcpy((BYTE *)dst + ZSTD_blockHeaderSize, src, srcSize); + ZSTD_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw); + return ZSTD_blockHeaderSize + srcSize; +} + /* ZSTD_noCompressLiterals() : stores the literals uncompressed (set_basic) behind a header of flSize bytes : 1 when srcSize <= 31, 2 when srcSize <= 4095, 3 otherwise. @return : srcSize + flSize, or dstSize_tooSmall */ +static size_t ZSTD_noCompressLiterals(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + BYTE *const ostart = (BYTE * const)dst; + U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095); + + if (srcSize + flSize > dstCapacity) + return ERROR(dstSize_tooSmall); + + switch (flSize) { + case 1: /* 2 - 1 - 5 */ ostart[0] = (BYTE)((U32)set_basic + (srcSize << 3)); break; + case 2: /* 2 - 2 - 12 */ ZSTD_writeLE16(ostart, (U16)((U32)set_basic + (1 << 2) + (srcSize << 4))); break; + default: /*note : should not be necessary : flSize is within {1,2,3} */ + case 3: /* 2 - 2 - 20 */ ZSTD_writeLE32(ostart, (U32)((U32)set_basic + (3 << 2) + (srcSize << 4))); break; + } + + memcpy(ostart + flSize, src, srcSize); + return srcSize + flSize; +} + /* ZSTD_compressRleLiteralsBlock() : writes a set_rle header of flSize bytes (same size rule as ZSTD_noCompressLiterals()) followed by the single byte src[0]; dstCapacity is not checked here. @return : flSize + 1 */ +static size_t ZSTD_compressRleLiteralsBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + BYTE *const ostart = (BYTE * const)dst; + U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095); + + (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ + + switch (flSize) { + case 1: /* 2 - 1 - 5 */ ostart[0] = (BYTE)((U32)set_rle + (srcSize << 3)); break; + case 2: /* 2 
- 2 - 12 */ ZSTD_writeLE16(ostart, (U16)((U32)set_rle + (1 << 2) + (srcSize << 4))); break; + default: /*note : should not be necessary : flSize is necessarily within {1,2,3} */ + case 3: /* 2 - 2 - 20 */ ZSTD_writeLE32(ostart, (U32)((U32)set_rle + (3 << 2) + (srcSize << 4))); break; + } + + ostart[flSize] = *(const BYTE *)src; + return flSize + 1; +} + /* ZSTD_minGain() : smallest number of bytes compression must save to be kept : srcSize / 64 + 2 */ +static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; } + /* ZSTD_compressLiterals() : Huffman-compresses the literals into `dst` behind a header of lhSize bytes (3, 4 or 5). + * Falls back to ZSTD_noCompressLiterals() for small inputs or when the gain is below ZSTD_minGain(), and to ZSTD_compressRleLiteralsBlock() when the compressed size is 1. + * Updates zc->flagStaticHufTable to record whether zc->hufTable may be reused for the next call. + * @return : number of bytes written into `dst`, or an error code */ +static size_t ZSTD_compressLiterals(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t const minGain = ZSTD_minGain(srcSize); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE *const ostart = (BYTE *)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +/* small ? don't even attempt compression (speed opt) */ +#define LITERAL_NOENTROPY 63 + { + size_t const minLitSize = zc->flagStaticHufTable == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY; + if (srcSize <= minLitSize) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + + if (dstCapacity < lhSize + 1) + return ERROR(dstSize_tooSmall); /* not enough space for compression */ + { + HUF_repeat repeat = zc->flagStaticHufTable; + int const preferRepeat = zc->params.cParams.strategy < ZSTD_lazy ? srcSize <= 1024 : 0; + if (repeat == HUF_repeat_valid && lhSize == 3) + singleStream = 1; + cLitSize = singleStream ? 
HUF_compress1X_repeat(ostart + lhSize, dstCapacity - lhSize, src, srcSize, 255, 11, zc->tmpCounters, + sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat) + : HUF_compress4X_repeat(ostart + lhSize, dstCapacity - lhSize, src, srcSize, 255, 11, zc->tmpCounters, + sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat); + if (repeat != HUF_repeat_none) { + hType = set_repeat; + } /* reused the existing table */ + else { + zc->flagStaticHufTable = HUF_repeat_check; + } /* now have a table to reuse */ + } + /* no output, or gain below minGain : store raw and forget the Huffman table */ + if ((cLitSize == 0) | (cLitSize >= srcSize - minGain)) { + zc->flagStaticHufTable = HUF_repeat_none; + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + if (cLitSize == 1) { + zc->flagStaticHufTable = HUF_repeat_none; + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } + + /* Build header */ + switch (lhSize) { + case 3: /* 2 - 2 - 10 - 10 */ + { + U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 14); + ZSTD_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { + U32 const lhc = hType + (2 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 18); + ZSTD_writeLE32(ostart, lhc); + break; + } + default: /* should not be necessary, lhSize is only {3,4,5} */ + case 5: /* 2 - 2 - 18 - 18 */ + { + U32 const lhc = hType + (3 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 22); + ZSTD_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + } + return lhSize + cLitSize; +} + /* LL_Code[] : literal-length code for every litLength value 0..63 (larger values are derived in ZSTD_seqToCodes()) */ +static const BYTE LL_Code[64] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 17, 18, 18, + 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; + /* ML_Code[] : match-length code for every matchLength value 0..127 (larger values are derived in ZSTD_seqToCodes()) */ +static const BYTE ML_Code[128] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 
36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, + 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42}; + /* ZSTD_seqToCodes() : fills the llCode / ofCode / mlCode arrays of `seqStorePtr` with one code per stored sequence. + * Lengths up to 63 (literals) or 127 (matches) use the lookup tables, larger ones use highbit + delta; the offset code is the highbit of the stored offset. */ +void ZSTD_seqToCodes(const seqStore_t *seqStorePtr) +{ + BYTE const LL_deltaCode = 19; + BYTE const ML_deltaCode = 36; + const seqDef *const sequences = seqStorePtr->sequencesStart; + BYTE *const llCodeTable = seqStorePtr->llCode; + BYTE *const ofCodeTable = seqStorePtr->ofCode; + BYTE *const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; + for (u = 0; u < nbSeq; u++) { + U32 const llv = sequences[u].litLength; + U32 const mlv = sequences[u].matchLength; + llCodeTable[u] = (llv > 63) ? (BYTE)ZSTD_highbit32(llv) + LL_deltaCode : LL_Code[llv]; + ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); + mlCodeTable[u] = (mlv > 127) ? 
(BYTE)ZSTD_highbit32(mlv) + ML_deltaCode : ML_Code[mlv]; + } + if (seqStorePtr->longLengthID == 1) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthID == 2) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; +} + /* ZSTD_compressSequences_internal() : writes into `dst`, in this order : the literals section, the sequence count, the seqHead type byte, + * one table description per symbol type (literal length, offset, match length) and the interleaved sequence bitstream. + * @return : number of bytes written, or an error code */ +ZSTD_STATIC size_t ZSTD_compressSequences_internal(ZSTD_CCtx *zc, void *dst, size_t dstCapacity) +{ + const int longOffsets = zc->params.cParams.windowLog > STREAM_ACCUMULATOR_MIN; + const seqStore_t *seqStorePtr = &(zc->seqStore); + FSE_CTable *CTable_LitLength = zc->litlengthCTable; + FSE_CTable *CTable_OffsetBits = zc->offcodeCTable; + FSE_CTable *CTable_MatchLength = zc->matchlengthCTable; + U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ + const seqDef *const sequences = seqStorePtr->sequencesStart; + const BYTE *const ofCodeTable = seqStorePtr->ofCode; + const BYTE *const llCodeTable = seqStorePtr->llCode; + const BYTE *const mlCodeTable = seqStorePtr->mlCode; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstCapacity; + BYTE *op = ostart; + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + BYTE *seqHead; + /* count[], norm[] and the FSE scratch area are carved, in that order, out of zc->tmpCounters */ + U32 *count; + S16 *norm; + U32 *workspace; + size_t workspaceSize = sizeof(zc->tmpCounters); + { + size_t spaceUsed32 = 0; + count = (U32 *)zc->tmpCounters + spaceUsed32; + spaceUsed32 += MaxSeq + 1; + norm = (S16 *)((U32 *)zc->tmpCounters + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2; + + workspace = (U32 *)zc->tmpCounters + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + } + + /* Compress literals */ + { + const BYTE *const literals = seqStorePtr->litStart; + size_t const litSize = seqStorePtr->lit - literals; + size_t const cSize = ZSTD_compressLiterals(zc, op, dstCapacity, literals, litSize); + if (ZSTD_isError(cSize)) + return cSize; + op += cSize; + } + + /* Sequences Header */ + if ((oend - op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) + return ERROR(dstSize_tooSmall); + if (nbSeq < 0x7F) + *op++ = 
(BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq >> 8) + 0x80), op[1] = (BYTE)nbSeq, op += 2; + else + op[0] = 0xFF, ZSTD_writeLE16(op + 1, (U16)(nbSeq - LONGNBSEQ)), op += 3; + if (nbSeq == 0) + return op - ostart; + + /* seqHead : flags for FSE encoding type */ + seqHead = op++; + +#define MIN_SEQ_FOR_DYNAMIC_FSE 64 +#define MAX_SEQ_FOR_STATIC_FSE 1000 + + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + /* the three sections below pick the table type in the same order : set_rle when mostFrequent == nbSeq (and nbSeq > 2), set_repeat when zc->flagStaticTables is set and nbSeq < MAX_SEQ_FOR_STATIC_FSE, set_basic (default norm) for few sequences or a low mostFrequent, else set_compressed with the normalized counts written to the stream */ + /* CTable for Literal Lengths */ + { + U32 max = MaxLL; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace); + if ((mostFrequent == nbSeq) && (nbSeq > 2)) { + *op++ = llCodeTable[0]; + FSE_buildCTable_rle(CTable_LitLength, (BYTE)max); + LLtype = set_rle; + } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { + LLtype = set_repeat; + } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog - 1)))) { + FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, workspace, workspaceSize); + LLtype = set_basic; + } else { + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); + if (count[llCodeTable[nbSeq - 1]] > 1) { + count[llCodeTable[nbSeq - 1]]--; + nbSeq_1--; + } + FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); + { + size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) + return NCountSize; + op += NCountSize; + } + FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, workspace, workspaceSize); + LLtype = set_compressed; + } + } + + /* CTable for Offsets */ + { + U32 max = MaxOff; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace); + if ((mostFrequent == nbSeq) && (nbSeq > 2)) { + *op++ = ofCodeTable[0]; + FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); + Offtype = set_rle; + } else if ((zc->flagStaticTables) && (nbSeq < 
MAX_SEQ_FOR_STATIC_FSE)) { + Offtype = set_repeat; + } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog - 1)))) { + FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, workspace, workspaceSize); + Offtype = set_basic; + } else { + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); + if (count[ofCodeTable[nbSeq - 1]] > 1) { + count[ofCodeTable[nbSeq - 1]]--; + nbSeq_1--; + } + FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); + { + size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) + return NCountSize; + op += NCountSize; + } + FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, workspace, workspaceSize); + Offtype = set_compressed; + } + } + + /* CTable for MatchLengths */ + { + U32 max = MaxML; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace); + if ((mostFrequent == nbSeq) && (nbSeq > 2)) { + *op++ = *mlCodeTable; + FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max); + MLtype = set_rle; + } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { + MLtype = set_repeat; + } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog - 1)))) { + FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, workspace, workspaceSize); + MLtype = set_basic; + } else { + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); + if (count[mlCodeTable[nbSeq - 1]] > 1) { + count[mlCodeTable[nbSeq - 1]]--; + nbSeq_1--; + } + FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); + { + size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) + return NCountSize; + op += NCountSize; + } + FSE_buildCTable_wksp(CTable_MatchLength, norm, max, 
tableLog, workspace, workspaceSize); + MLtype = set_compressed; + } + } + + *seqHead = (BYTE)((LLtype << 6) + (Offtype << 4) + (MLtype << 2)); + zc->flagStaticTables = 0; /* the flag is cleared after every block that reaches this point */ + + /* Encoding Sequences */ + { + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; + FSE_CState_t stateOffsetBits; + FSE_CState_t stateLitLength; + + CHECK_E(BIT_initCStream(&blockStream, op, oend - op), dstSize_tooSmall); /* not enough space remaining */ + /* sequences are written starting from the last one (nbSeq - 1) down to index 0 */ + /* first symbols */ + FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq - 1]); + FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq - 1]); + FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq - 1]); + BIT_addBits(&blockStream, sequences[nbSeq - 1].litLength, LL_bits[llCodeTable[nbSeq - 1]]); + if (ZSTD_32bits()) + BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[nbSeq - 1].matchLength, ML_bits[mlCodeTable[nbSeq - 1]]); + if (ZSTD_32bits()) + BIT_flushBits(&blockStream); + if (longOffsets) { + U32 const ofBits = ofCodeTable[nbSeq - 1]; + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[nbSeq - 1].offset, extraBits); + BIT_flushBits(&blockStream); + } + BIT_addBits(&blockStream, sequences[nbSeq - 1].offset >> extraBits, ofBits - extraBits); + } else { + BIT_addBits(&blockStream, sequences[nbSeq - 1].offset, ofCodeTable[nbSeq - 1]); + } + BIT_flushBits(&blockStream); + + { + size_t n; + for (n = nbSeq - 2; n < nbSeq; n--) { /* intentional underflow */ + BYTE const llCode = llCodeTable[n]; + BYTE const ofCode = ofCodeTable[n]; + BYTE const mlCode = mlCodeTable[n]; + U32 const llBits = LL_bits[llCode]; + U32 const ofBits = ofCode; /* 32b*/ /* 64b*/ + U32 const mlBits = ML_bits[mlCode]; + /* (7)*/ /* (7)*/ + FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */ + FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */ + if (ZSTD_32bits()) 
+ BIT_flushBits(&blockStream); /* (7)*/ + FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */ + if (ZSTD_32bits() || (ofBits + mlBits + llBits >= 64 - 7 - (LLFSELog + MLFSELog + OffFSELog))) + BIT_flushBits(&blockStream); /* (7)*/ + BIT_addBits(&blockStream, sequences[n].litLength, llBits); + if (ZSTD_32bits() && ((llBits + mlBits) > 24)) + BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + if (ZSTD_32bits()) + BIT_flushBits(&blockStream); /* (7)*/ + if (longOffsets) { + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[n].offset, extraBits); + BIT_flushBits(&blockStream); /* (7)*/ + } + BIT_addBits(&blockStream, sequences[n].offset >> extraBits, ofBits - extraBits); /* 31 */ + } else { + BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ + } + BIT_flushBits(&blockStream); /* (7)*/ + } + } + + FSE_flushCState(&blockStream, &stateMatchLength); + FSE_flushCState(&blockStream, &stateOffsetBits); + FSE_flushCState(&blockStream, &stateLitLength); + + { + size_t const streamSize = BIT_closeCStream(&blockStream); + if (streamSize == 0) + return ERROR(dstSize_tooSmall); /* not enough space */ + op += streamSize; + } + } + return op - ostart; +} + /* ZSTD_compressSequences() : runs ZSTD_compressSequences_internal() and decides whether its output is kept. + * @return : the compressed size; 0 when the result is not smaller than srcSize - ZSTD_minGain(srcSize) or when dst ran out of room although srcSize <= dstCapacity; otherwise the error code */ +ZSTD_STATIC size_t ZSTD_compressSequences(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, size_t srcSize) +{ + size_t const cSize = ZSTD_compressSequences_internal(zc, dst, dstCapacity); + size_t const minGain = ZSTD_minGain(srcSize); + size_t const maxCSize = srcSize - minGain; + /* If the srcSize <= dstCapacity, then there is enough space to write a + * raw uncompressed block. Since we ran out of space, the block must not + * be compressible, so fall back to a raw uncompressed block. 
+ */ + int const uncompressibleError = cSize == ERROR(dstSize_tooSmall) && srcSize <= dstCapacity; + int i; + + if (ZSTD_isError(cSize) && !uncompressibleError) + return cSize; + if (cSize >= maxCSize || uncompressibleError) { + zc->flagStaticHufTable = HUF_repeat_none; + return 0; + } + /* confirm repcodes */ + for (i = 0; i < ZSTD_REP_NUM; i++) + zc->rep[i] = zc->repToConfirm[i]; + return cSize; +} + +/*! ZSTD_storeSeq() : + Store a sequence (literal length, literals, offset code and match length code) into seqStore_t. + `offsetCode` : distance to match, or 0 == repCode. + `matchCode` : matchLength - MINMATCH + Lengths are stored truncated to 16 bits; a length above 0xFFFF is flagged through longLengthID (1 = literal length, 2 = match length) and longLengthPos. + The offset is stored as offsetCode + 1. +*/ +ZSTD_STATIC void ZSTD_storeSeq(seqStore_t *seqStorePtr, size_t litLength, const void *literals, U32 offsetCode, size_t matchCode) +{ + /* copy Literals */ + ZSTD_wildcopy(seqStorePtr->lit, literals, litLength); + seqStorePtr->lit += litLength; + + /* literal Length */ + if (litLength > 0xFFFF) { + seqStorePtr->longLengthID = 1; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ + seqStorePtr->sequences[0].offset = offsetCode + 1; + + /* match Length */ + if (matchCode > 0xFFFF) { + seqStorePtr->longLengthID = 2; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].matchLength = (U16)matchCode; + + seqStorePtr->sequences++; +} + +/*-************************************* +* Match length counter +***************************************/ +/* ZSTD_NbCommonBytes() : `val` is the XOR of two machine words; returns the bit index of its first set bit divided by 8, counting trailing zero bits on little-endian CPUs and leading zero bits on big-endian CPUs. The builtins used are undefined for val == 0. */ static unsigned ZSTD_NbCommonBytes(register size_t val) +{ + if (ZSTD_isLittleEndian()) { + if (ZSTD_64bits()) { + return (__builtin_ctzll((U64)val) >> 3); + } else { /* 32 bits */ + return (__builtin_ctz((U32)val) >> 3); + } + } else { /* Big Endian CPU */ + if (ZSTD_64bits()) { + return (__builtin_clzll(val) >> 3); + } else { /* 32 bits */ + return (__builtin_clz((U32)val) >> 3); + } + } +} + /* ZSTD_count() : number of leading bytes that are equal at `pIn` and `pMatch`; `pIn` is never read at or beyond `pInLimit` */ +static size_t ZSTD_count(const BYTE 
*pIn, const BYTE *pMatch, const BYTE *const pInLimit) +{ + const BYTE *const pStart = pIn; + const BYTE *const pInLoopLimit = pInLimit - (sizeof(size_t) - 1); + /* compare one machine word at a time while a whole word still fits before pInLimit */ + while (pIn < pInLoopLimit) { + size_t const diff = ZSTD_readST(pMatch) ^ ZSTD_readST(pIn); + if (!diff) { + pIn += sizeof(size_t); + pMatch += sizeof(size_t); + continue; + } + pIn += ZSTD_NbCommonBytes(diff); + return (size_t)(pIn - pStart); + } + /* tail : try 4 bytes (64-bit builds only), then 2 bytes, then 1 byte */ if (ZSTD_64bits()) + if ((pIn < (pInLimit - 3)) && (ZSTD_read32(pMatch) == ZSTD_read32(pIn))) { + pIn += 4; + pMatch += 4; + } + if ((pIn < (pInLimit - 1)) && (ZSTD_read16(pMatch) == ZSTD_read16(pIn))) { + pIn += 2; + pMatch += 2; + } + if ((pIn < pInLimit) && (*pMatch == *pIn)) + pIn++; + return (size_t)(pIn - pStart); +} + +/** ZSTD_count_2segments() : +* can count match length with `ip` & `match` in 2 different segments. +* convention : on reaching mEnd, match count continue starting from iStart +*/ +static size_t ZSTD_count_2segments(const BYTE *ip, const BYTE *match, const BYTE *iEnd, const BYTE *mEnd, const BYTE *iStart) +{ + const BYTE *const vEnd = MIN(ip + (mEnd - match), iEnd); + size_t const matchLength = ZSTD_count(ip, match, vEnd); + if (match + matchLength != mEnd) + return matchLength; + return matchLength + ZSTD_count(ip + matchLength, iStart, iEnd); +} + +/*-************************************* +* Hashes +***************************************/ +/* ZSTD_hashN(u, h) : multiplies by a fixed constant and keeps the top `h` bits; for N = 3, 5, 6, 7 the left shift first discards all but the N low-order bytes of `u`. ZSTD_hashNPtr() hashes the value read at a pointer. */ static const U32 prime3bytes = 506832829U; +static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32 - 24)) * prime3bytes) >> (32 - h); } +ZSTD_STATIC size_t ZSTD_hash3Ptr(const void *ptr, U32 h) { return ZSTD_hash3(ZSTD_readLE32(ptr), h); } /* only in zstd_opt.h */ + +static const U32 prime4bytes = 2654435761U; +static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32 - h); } +static size_t ZSTD_hash4Ptr(const void *ptr, U32 h) { return ZSTD_hash4(ZSTD_read32(ptr), h); } + +static const U64 prime5bytes = 889523592379ULL; +static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64 
- 40)) * prime5bytes) >> (64 - h)); } +static size_t ZSTD_hash5Ptr(const void *p, U32 h) { return ZSTD_hash5(ZSTD_readLE64(p), h); } + +static const U64 prime6bytes = 227718039650203ULL; +static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64 - 48)) * prime6bytes) >> (64 - h)); } +static size_t ZSTD_hash6Ptr(const void *p, U32 h) { return ZSTD_hash6(ZSTD_readLE64(p), h); } + +static const U64 prime7bytes = 58295818150454627ULL; +static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64 - 56)) * prime7bytes) >> (64 - h)); } +static size_t ZSTD_hash7Ptr(const void *p, U32 h) { return ZSTD_hash7(ZSTD_readLE64(p), h); } + +static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u)*prime8bytes) >> (64 - h)); } +static size_t ZSTD_hash8Ptr(const void *p, U32 h) { return ZSTD_hash8(ZSTD_readLE64(p), h); } + /* ZSTD_hashPtr() : hashes the bytes at `p` down to `hBits` bits with the hash selected by `mls`; any `mls` other than 5, 6, 7 or 8 uses the 4-byte hash */ +static size_t ZSTD_hashPtr(const void *p, U32 hBits, U32 mls) +{ + switch (mls) { + // case 3: return ZSTD_hash3Ptr(p, hBits); + default: + case 4: return ZSTD_hash4Ptr(p, hBits); + case 5: return ZSTD_hash5Ptr(p, hBits); + case 6: return ZSTD_hash6Ptr(p, hBits); + case 7: return ZSTD_hash7Ptr(p, hBits); + case 8: return ZSTD_hash8Ptr(p, hBits); + } +} + +/*-************************************* +* Fast Scan +***************************************/ +/* ZSTD_fillHashTable() : records every 3rd position from base + nextToUpdate up to end - HASH_READ_SIZE (inclusive) in zc->hashTable */ static void ZSTD_fillHashTable(ZSTD_CCtx *zc, const void *end, const U32 mls) +{ + U32 *const hashTable = zc->hashTable; + U32 const hBits = zc->params.cParams.hashLog; + const BYTE *const base = zc->base; + const BYTE *ip = base + zc->nextToUpdate; + const BYTE *const iend = ((const BYTE *)end) - HASH_READ_SIZE; + const size_t fastHashFillStep = 3; + + while (ip <= iend) { + hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base); + ip += fastHashFillStep; + } +} + /* ZSTD_compressBlock_fast_generic() : match finder that probes one hash-table candidate per position, after first testing the repeat offset at ip + 1. + * Found sequences go to cctx->seqStore, the trailing literals are appended after the loop, and the two final offsets are left in cctx->repToConfirm. + * Incoming rep offsets larger than the distance from `ip` to `lowest` are set aside in offsetSaved and treated as 0 while searching. */ +FORCE_INLINE +void ZSTD_compressBlock_fast_generic(ZSTD_CCtx *cctx, const void *src, size_t srcSize, const U32 mls) +{ + U32 *const hashTable = cctx->hashTable; + U32 const 
hBits = cctx->params.cParams.hashLog; + seqStore_t *seqStorePtr = &(cctx->seqStore); + const BYTE *const base = cctx->base; + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const U32 lowestIndex = cctx->dictLimit; + const BYTE *const lowest = base + lowestIndex; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - HASH_READ_SIZE; + U32 offset_1 = cctx->rep[0], offset_2 = cctx->rep[1]; + U32 offsetSaved = 0; + + /* init */ + ip += (ip == lowest); + { + U32 const maxRep = (U32)(ip - lowest); + if (offset_2 > maxRep) + offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) + offsetSaved = offset_1, offset_1 = 0; + } + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + size_t const h = ZSTD_hashPtr(ip, hBits, mls); + U32 const curr = (U32)(ip - base); + U32 const matchIndex = hashTable[h]; + const BYTE *match = base + matchIndex; + hashTable[h] = curr; /* update hash table */ + + if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) { + mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); + } else { + U32 offset; + if ((matchIndex <= lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) { + ip += ((ip - anchor) >> g_searchStrength) + 1; /* no match : the step grows with the distance since the last match */ + continue; + } + mLength = ZSTD_count(ip + 4, match + 4, iend) + 4; + offset = (U32)(ip - match); + while (((ip > anchor) & (match > lowest)) && (ip[-1] == match[-1])) { + ip--; + match--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + } + + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashTable[ZSTD_hashPtr(base + curr + 2, hBits, mls)] = curr + 2; /* here because curr+2 
could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip - 2, hBits, mls)] = (U32)(ip - 2 - base); + /* check immediate repcode */ + while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4; + { + U32 const tmpOff = offset_2; + offset_2 = offset_1; + offset_1 = tmpOff; + } /* swap offset_2 <=> offset_1 */ + hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base); + ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength - MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + } + } + + /* save reps for next block */ + cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved; + cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved; + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + /* ZSTD_compressBlock_fast() : calls the generic routine with a literal `mls` taken from cParams.searchLength; any value other than 5, 6 or 7 is handled as 4 */ +static void ZSTD_compressBlock_fast(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ + const U32 mls = ctx->params.cParams.searchLength; + switch (mls) { + default: /* includes case 3 */ + case 4: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4); return; + case 5: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 5); return; + case 6: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 6); return; + case 7: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 7); return; + } +} + +static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 mls) +{ + U32 *hashTable = ctx->hashTable; + const U32 hBits = ctx->params.cParams.hashLog; + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const base = ctx->base; + const BYTE *const dictBase = ctx->dictBase; + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const U32 lowestIndex = ctx->lowLimit; + const BYTE *const dictStart = dictBase + lowestIndex; + 
const U32 dictLimit = ctx->dictLimit; + const BYTE *const lowPrefixPtr = base + dictLimit; + const BYTE *const dictEnd = dictBase + dictLimit; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1]; + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t h = ZSTD_hashPtr(ip, hBits, mls); + const U32 matchIndex = hashTable[h]; + const BYTE *matchBase = matchIndex < dictLimit ? dictBase : base; + const BYTE *match = matchBase + matchIndex; + const U32 curr = (U32)(ip - base); + const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */ + const BYTE *repBase = repIndex < dictLimit ? dictBase : base; + const BYTE *repMatch = repBase + repIndex; + size_t mLength; + hashTable[h] = curr; /* update hash table */ + + if ((((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) && + (ZSTD_read32(repMatch) == ZSTD_read32(ip + 1))) { + const BYTE *repMatchEnd = repIndex < dictLimit ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip + 1 + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repMatchEnd, lowPrefixPtr) + EQUAL_READ32; + ip++; + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); + } else { + if ((matchIndex < lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) { + ip += ((ip - anchor) >> g_searchStrength) + 1; + continue; + } + { + const BYTE *matchEnd = matchIndex < dictLimit ? dictEnd : iend; + const BYTE *lowMatchPtr = matchIndex < dictLimit ? 
dictStart : lowPrefixPtr; + U32 offset; + mLength = ZSTD_count_2segments(ip + EQUAL_READ32, match + EQUAL_READ32, iend, matchEnd, lowPrefixPtr) + EQUAL_READ32; + while (((ip > anchor) & (match > lowMatchPtr)) && (ip[-1] == match[-1])) { + ip--; + match--; + mLength++; + } /* catch up */ + offset = curr - matchIndex; + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + } + } + + /* found a match : store it */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashTable[ZSTD_hashPtr(base + curr + 2, hBits, mls)] = curr + 2; + hashTable[ZSTD_hashPtr(ip - 2, hBits, mls)] = (U32)(ip - 2 - base); + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const curr2 = (U32)(ip - base); + U32 const repIndex2 = curr2 - offset_2; + const BYTE *repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2; + if ((((U32)((dictLimit - 1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ + && (ZSTD_read32(repMatch2) == ZSTD_read32(ip))) { + const BYTE *const repEnd2 = repIndex2 < dictLimit ? 
dictEnd : iend; + size_t repLength2 = + ZSTD_count_2segments(ip + EQUAL_READ32, repMatch2 + EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32; + U32 tmpOffset = offset_2; + offset_2 = offset_1; + offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH); + hashTable[ZSTD_hashPtr(ip, hBits, mls)] = curr2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + } + + /* save reps for next block */ + ctx->repToConfirm[0] = offset_1; + ctx->repToConfirm[1] = offset_2; + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +static void ZSTD_compressBlock_fast_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ + U32 const mls = ctx->params.cParams.searchLength; + switch (mls) { + default: /* includes case 3 */ + case 4: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4); return; + case 5: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 5); return; + case 6: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 6); return; + case 7: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 7); return; + } +} + +/*-************************************* +* Double Fast +***************************************/ +static void ZSTD_fillDoubleHashTable(ZSTD_CCtx *cctx, const void *end, const U32 mls) +{ + U32 *const hashLarge = cctx->hashTable; + U32 const hBitsL = cctx->params.cParams.hashLog; + U32 *const hashSmall = cctx->chainTable; + U32 const hBitsS = cctx->params.cParams.chainLog; + const BYTE *const base = cctx->base; + const BYTE *ip = base + cctx->nextToUpdate; + const BYTE *const iend = ((const BYTE *)end) - HASH_READ_SIZE; + const size_t fastHashFillStep = 3; + + while (ip <= iend) { + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base); + hashLarge[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base); + ip += fastHashFillStep; + } +} + 
+FORCE_INLINE +void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx *cctx, const void *src, size_t srcSize, const U32 mls) +{ + U32 *const hashLong = cctx->hashTable; + const U32 hBitsL = cctx->params.cParams.hashLog; + U32 *const hashSmall = cctx->chainTable; + const U32 hBitsS = cctx->params.cParams.chainLog; + seqStore_t *seqStorePtr = &(cctx->seqStore); + const BYTE *const base = cctx->base; + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const U32 lowestIndex = cctx->dictLimit; + const BYTE *const lowest = base + lowestIndex; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - HASH_READ_SIZE; + U32 offset_1 = cctx->rep[0], offset_2 = cctx->rep[1]; + U32 offsetSaved = 0; + + /* init */ + ip += (ip == lowest); + { + U32 const maxRep = (U32)(ip - lowest); + if (offset_2 > maxRep) + offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) + offsetSaved = offset_1, offset_1 = 0; + } + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); + U32 const curr = (U32)(ip - base); + U32 const matchIndexL = hashLong[h2]; + U32 const matchIndexS = hashSmall[h]; + const BYTE *matchLong = base + matchIndexL; + const BYTE *match = base + matchIndexS; + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) { /* note : by construction, offset_1 <= curr */ + mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); + } else { + U32 offset; + if ((matchIndexL > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) { + mLength = ZSTD_count(ip + 8, matchLong + 8, iend) + 8; + offset = (U32)(ip - matchLong); + while (((ip > anchor) & 
(matchLong > lowest)) && (ip[-1] == matchLong[-1])) { + ip--; + matchLong--; + mLength++; + } /* catch up */ + } else if ((matchIndexS > lowestIndex) && (ZSTD_read32(match) == ZSTD_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8); + U32 const matchIndex3 = hashLong[h3]; + const BYTE *match3 = base + matchIndex3; + hashLong[h3] = curr + 1; + if ((matchIndex3 > lowestIndex) && (ZSTD_read64(match3) == ZSTD_read64(ip + 1))) { + mLength = ZSTD_count(ip + 9, match3 + 8, iend) + 8; + ip++; + offset = (U32)(ip - match3); + while (((ip > anchor) & (match3 > lowest)) && (ip[-1] == match3[-1])) { + ip--; + match3--; + mLength++; + } /* catch up */ + } else { + mLength = ZSTD_count(ip + 4, match + 4, iend) + 4; + offset = (U32)(ip - match); + while (((ip > anchor) & (match > lowest)) && (ip[-1] == match[-1])) { + ip--; + match--; + mLength++; + } /* catch up */ + } + } else { + ip += ((ip - anchor) >> g_searchStrength) + 1; + continue; + } + + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + } + + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] = + curr + 2; /* here because curr+2 could be > iend-8 */ + hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base); + + /* check immediate repcode */ + while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4; + { + U32 const tmpOff = offset_2; + offset_2 = offset_1; + offset_1 = tmpOff; + } /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base); + ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, 
rLength - MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + } + } + + /* save reps for next block */ + cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved; + cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved; + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +static void ZSTD_compressBlock_doubleFast(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ + const U32 mls = ctx->params.cParams.searchLength; + switch (mls) { + default: /* includes case 3 */ + case 4: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 4); return; + case 5: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 5); return; + case 6: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 6); return; + case 7: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 7); return; + } +} + +static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 mls) +{ + U32 *const hashLong = ctx->hashTable; + U32 const hBitsL = ctx->params.cParams.hashLog; + U32 *const hashSmall = ctx->chainTable; + U32 const hBitsS = ctx->params.cParams.chainLog; + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const base = ctx->base; + const BYTE *const dictBase = ctx->dictBase; + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const U32 lowestIndex = ctx->lowLimit; + const BYTE *const dictStart = dictBase + lowestIndex; + const U32 dictLimit = ctx->dictLimit; + const BYTE *const lowPrefixPtr = base + dictLimit; + const BYTE *const dictEnd = dictBase + dictLimit; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1]; + + /* Search Loop */ + while (ip < ilimit) { /* < instead of <=, because (ip+1) */ + const size_t hSmall = 
ZSTD_hashPtr(ip, hBitsS, mls); + const U32 matchIndex = hashSmall[hSmall]; + const BYTE *matchBase = matchIndex < dictLimit ? dictBase : base; + const BYTE *match = matchBase + matchIndex; + + const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8); + const U32 matchLongIndex = hashLong[hLong]; + const BYTE *matchLongBase = matchLongIndex < dictLimit ? dictBase : base; + const BYTE *matchLong = matchLongBase + matchLongIndex; + + const U32 curr = (U32)(ip - base); + const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */ + const BYTE *repBase = repIndex < dictLimit ? dictBase : base; + const BYTE *repMatch = repBase + repIndex; + size_t mLength; + hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ + + if ((((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) && + (ZSTD_read32(repMatch) == ZSTD_read32(ip + 1))) { + const BYTE *repMatchEnd = repIndex < dictLimit ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repMatchEnd, lowPrefixPtr) + 4; + ip++; + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH); + } else { + if ((matchLongIndex > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) { + const BYTE *matchEnd = matchLongIndex < dictLimit ? dictEnd : iend; + const BYTE *lowMatchPtr = matchLongIndex < dictLimit ? 
dictStart : lowPrefixPtr; + U32 offset; + mLength = ZSTD_count_2segments(ip + 8, matchLong + 8, iend, matchEnd, lowPrefixPtr) + 8; + offset = curr - matchLongIndex; + while (((ip > anchor) & (matchLong > lowMatchPtr)) && (ip[-1] == matchLong[-1])) { + ip--; + matchLong--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + + } else if ((matchIndex > lowestIndex) && (ZSTD_read32(match) == ZSTD_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8); + U32 const matchIndex3 = hashLong[h3]; + const BYTE *const match3Base = matchIndex3 < dictLimit ? dictBase : base; + const BYTE *match3 = match3Base + matchIndex3; + U32 offset; + hashLong[h3] = curr + 1; + if ((matchIndex3 > lowestIndex) && (ZSTD_read64(match3) == ZSTD_read64(ip + 1))) { + const BYTE *matchEnd = matchIndex3 < dictLimit ? dictEnd : iend; + const BYTE *lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr; + mLength = ZSTD_count_2segments(ip + 9, match3 + 8, iend, matchEnd, lowPrefixPtr) + 8; + ip++; + offset = curr + 1 - matchIndex3; + while (((ip > anchor) & (match3 > lowMatchPtr)) && (ip[-1] == match3[-1])) { + ip--; + match3--; + mLength++; + } /* catch up */ + } else { + const BYTE *matchEnd = matchIndex < dictLimit ? dictEnd : iend; + const BYTE *lowMatchPtr = matchIndex < dictLimit ? 
dictStart : lowPrefixPtr; + mLength = ZSTD_count_2segments(ip + 4, match + 4, iend, matchEnd, lowPrefixPtr) + 4; + offset = curr - matchIndex; + while (((ip > anchor) & (match > lowMatchPtr)) && (ip[-1] == match[-1])) { + ip--; + match--; + mLength++; + } /* catch up */ + } + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH); + + } else { + ip += ((ip - anchor) >> g_searchStrength) + 1; + continue; + } + } + + /* found a match : store it */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] = curr + 2; + hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = curr + 2; + hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base); + hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base); + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const curr2 = (U32)(ip - base); + U32 const repIndex2 = curr2 - offset_2; + const BYTE *repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2; + if ((((U32)((dictLimit - 1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional overflow */ + && (ZSTD_read32(repMatch2) == ZSTD_read32(ip))) { + const BYTE *const repEnd2 = repIndex2 < dictLimit ? 
dictEnd : iend; + size_t const repLength2 = + ZSTD_count_2segments(ip + EQUAL_READ32, repMatch2 + EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32; + U32 tmpOffset = offset_2; + offset_2 = offset_1; + offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = curr2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = curr2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + } + + /* save reps for next block */ + ctx->repToConfirm[0] = offset_1; + ctx->repToConfirm[1] = offset_2; + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +static void ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ + U32 const mls = ctx->params.cParams.searchLength; + switch (mls) { + default: /* includes case 3 */ + case 4: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 4); return; + case 5: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 5); return; + case 6: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 6); return; + case 7: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 7); return; + } +} + +/*-************************************* +* Binary Tree search +***************************************/ +/** ZSTD_insertBt1() : add one or multiple positions to tree. +* ip : assumed <= iend-8 . 
+* @return : nb of positions added */ +static U32 ZSTD_insertBt1(ZSTD_CCtx *zc, const BYTE *const ip, const U32 mls, const BYTE *const iend, U32 nbCompares, U32 extDict) +{ + U32 *const hashTable = zc->hashTable; + U32 const hashLog = zc->params.cParams.hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 *const bt = zc->chainTable; + U32 const btLog = zc->params.cParams.chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 matchIndex = hashTable[h]; + size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE *const base = zc->base; + const BYTE *const dictBase = zc->dictBase; + const U32 dictLimit = zc->dictLimit; + const BYTE *const dictEnd = dictBase + dictLimit; + const BYTE *const prefixStart = base + dictLimit; + const BYTE *match; + const U32 curr = (U32)(ip - base); + const U32 btLow = btMask >= curr ? 0 : curr - btMask; + U32 *smallerPtr = bt + 2 * (curr & btMask); + U32 *largerPtr = smallerPtr + 1; + U32 dummy32; /* to be nullified at the end */ + U32 const windowLow = zc->lowLimit; + U32 matchEndIdx = curr + 8; + size_t bestLength = 8; + + hashTable[h] = curr; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32 *const nextPtr = bt + 2 * (matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + + if ((!extDict) || (matchIndex + matchLength >= dictLimit)) { + match = base + matchIndex; + if (match[matchLength] == ip[matchLength]) + matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iend) + 1; + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart); + if (matchIndex + matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + bestLength = matchLength; + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = 
matchIndex + (U32)matchLength; + } + + if (ip + matchLength == iend) /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt the tree */ + + if (match[matchLength] < ip[matchLength]) { /* necessarily within correct buffer */ + /* match is smaller than curr */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { + smallerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */ + } else { + /* match is larger than curr */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { + largerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } + } + + *smallerPtr = *largerPtr = 0; + if (bestLength > 384) + return MIN(192, (U32)(bestLength - 384)); /* speed optimization */ + if (matchEndIdx > curr + 8) + return matchEndIdx - curr - 8; + return 1; +} + +static size_t ZSTD_insertBtAndFindBestMatch(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, size_t *offsetPtr, U32 nbCompares, const U32 mls, + U32 extDict) +{ + U32 *const hashTable = zc->hashTable; + U32 const hashLog = zc->params.cParams.hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 *const bt = zc->chainTable; + U32 const btLog = zc->params.cParams.chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 matchIndex = hashTable[h]; + size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE *const base = zc->base; + const BYTE *const dictBase = zc->dictBase; + const U32 dictLimit = zc->dictLimit; + const BYTE *const dictEnd = dictBase + dictLimit; + const BYTE *const prefixStart = base + 
dictLimit; + const U32 curr = (U32)(ip - base); + const U32 btLow = btMask >= curr ? 0 : curr - btMask; + const U32 windowLow = zc->lowLimit; + U32 *smallerPtr = bt + 2 * (curr & btMask); + U32 *largerPtr = bt + 2 * (curr & btMask) + 1; + U32 matchEndIdx = curr + 8; + U32 dummy32; /* to be nullified at the end */ + size_t bestLength = 0; + + hashTable[h] = curr; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32 *const nextPtr = bt + 2 * (matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE *match; + + if ((!extDict) || (matchIndex + matchLength >= dictLimit)) { + match = base + matchIndex; + if (match[matchLength] == ip[matchLength]) + matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iend) + 1; + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart); + if (matchIndex + matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + if ((4 * (int)(matchLength - bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)offsetPtr[0] + 1))) + bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; + if (ip + matchLength == iend) /* equal : no way to know if inf or sup */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + + if (match[matchLength] < ip[matchLength]) { + /* match is smaller than curr */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { + smallerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + 
smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */ + } else { + /* match is larger than curr */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { + largerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } + } + + *smallerPtr = *largerPtr = 0; + + zc->nextToUpdate = (matchEndIdx > curr + 8) ? matchEndIdx - 8 : curr + 1; + return bestLength; +} + +static void ZSTD_updateTree(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, const U32 nbCompares, const U32 mls) +{ + const BYTE *const base = zc->base; + const U32 target = (U32)(ip - base); + U32 idx = zc->nextToUpdate; + + while (idx < target) + idx += ZSTD_insertBt1(zc, base + idx, mls, iend, nbCompares, 0); +} + +/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +static size_t ZSTD_BtFindBestMatch(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 mls) +{ + if (ip < zc->base + zc->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls); + return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 0); +} + +static size_t ZSTD_BtFindBestMatch_selectMLS(ZSTD_CCtx *zc, /* Index table will be updated */ + const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 matchLengthSearch) +{ + switch (matchLengthSearch) { + default: /* includes case 3 */ + case 4: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4); + case 5: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5); + case 7: + case 6: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6); + } +} + +static void ZSTD_updateTree_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, const 
U32 nbCompares, const U32 mls) +{ + const BYTE *const base = zc->base; + const U32 target = (U32)(ip - base); + U32 idx = zc->nextToUpdate; + + while (idx < target) + idx += ZSTD_insertBt1(zc, base + idx, mls, iend, nbCompares, 1); +} + +/** Tree updater, providing best match */ +static size_t ZSTD_BtFindBestMatch_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, + const U32 mls) +{ + if (ip < zc->base + zc->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls); + return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 1); +} + +static size_t ZSTD_BtFindBestMatch_selectMLS_extDict(ZSTD_CCtx *zc, /* Index table will be updated */ + const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, + const U32 matchLengthSearch) +{ + switch (matchLengthSearch) { + default: /* includes case 3 */ + case 4: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4); + case 5: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5); + case 7: + case 6: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6); + } +} + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d)&mask] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +FORCE_INLINE +U32 ZSTD_insertAndFindFirstIndex(ZSTD_CCtx *zc, const BYTE *ip, U32 mls) +{ + U32 *const hashTable = zc->hashTable; + const U32 hashLog = zc->params.cParams.hashLog; + U32 *const chainTable = zc->chainTable; + const U32 chainMask = (1 << zc->params.cParams.chainLog) - 1; + const BYTE *const base = zc->base; + const U32 target = (U32)(ip - base); + U32 idx = zc->nextToUpdate; + + while (idx < target) { /* catch up */ + size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; + } + + zc->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +/* inlining is important to hardwire a hot branch (template emulation) */ +FORCE_INLINE +size_t ZSTD_HcFindBestMatch_generic(ZSTD_CCtx *zc, /* Index table will be updated */ + const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 mls, + const U32 extDict) +{ + U32 *const chainTable = zc->chainTable; + const U32 chainSize = (1 << zc->params.cParams.chainLog); + const U32 chainMask = chainSize - 1; + const BYTE *const base = zc->base; + const BYTE *const dictBase = zc->dictBase; + const U32 dictLimit = zc->dictLimit; + const BYTE *const prefixStart = base + dictLimit; + const BYTE *const dictEnd = dictBase + dictLimit; + const U32 lowLimit = zc->lowLimit; + const U32 curr = (U32)(ip - base); + const U32 minChain = curr > chainSize ? 
curr - chainSize : 0; + int nbAttempts = maxNbAttempts; + size_t ml = EQUAL_READ32 - 1; + + /* HC4 match finder */ + U32 matchIndex = ZSTD_insertAndFindFirstIndex(zc, ip, mls); + + for (; (matchIndex > lowLimit) & (nbAttempts > 0); nbAttempts--) { + const BYTE *match; + size_t currMl = 0; + if ((!extDict) || matchIndex >= dictLimit) { + match = base + matchIndex; + if (match[ml] == ip[ml]) /* potentially better */ + currMl = ZSTD_count(ip, match, iLimit); + } else { + match = dictBase + matchIndex; + if (ZSTD_read32(match) == ZSTD_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currMl = ZSTD_count_2segments(ip + EQUAL_READ32, match + EQUAL_READ32, iLimit, dictEnd, prefixStart) + EQUAL_READ32; + } + + /* save best solution */ + if (currMl > ml) { + ml = currMl; + *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE; + if (ip + currMl == iLimit) + break; /* best possible, and avoid read overflow*/ + } + + if (matchIndex <= minChain) + break; + matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); + } + + return ml; +} + +FORCE_INLINE size_t ZSTD_HcFindBestMatch_selectMLS(ZSTD_CCtx *zc, const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, + const U32 matchLengthSearch) +{ + switch (matchLengthSearch) { + default: /* includes case 3 */ + case 4: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 0); + case 5: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 0); + case 7: + case 6: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 0); + } +} + +FORCE_INLINE size_t ZSTD_HcFindBestMatch_extDict_selectMLS(ZSTD_CCtx *zc, const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, + const U32 matchLengthSearch) +{ + switch (matchLengthSearch) { + default: /* includes case 3 */ + case 4: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 1); + case 5: return 
ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 1); + case 7: + case 6: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 1); + } +} + +/* ******************************* +* Common parser - lazy strategy +*********************************/ +FORCE_INLINE +void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 searchMethod, const U32 depth) +{ + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + const BYTE *const base = ctx->base + ctx->dictLimit; + + U32 const maxSearches = 1 << ctx->params.cParams.searchLog; + U32 const mls = ctx->params.cParams.searchLength; + + typedef size_t (*searchMax_f)(ZSTD_CCtx * zc, const BYTE *ip, const BYTE *iLimit, size_t *offsetPtr, U32 maxNbAttempts, U32 matchLengthSearch); + searchMax_f const searchMax = searchMethod ? 
ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS; + U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1], savedOffset = 0; + + /* init */ + ip += (ip == base); + ctx->nextToUpdate3 = ctx->nextToUpdate; + { + U32 const maxRep = (U32)(ip - base); + if (offset_2 > maxRep) + savedOffset = offset_2, offset_2 = 0; + if (offset_1 > maxRep) + savedOffset = offset_1, offset_1 = 0; + } + + /* Match Loop */ + while (ip < ilimit) { + size_t matchLength = 0; + size_t offset = 0; + const BYTE *start = ip + 1; + + /* check repCode */ + if ((offset_1 > 0) & (ZSTD_read32(ip + 1) == ZSTD_read32(ip + 1 - offset_1))) { + /* repcode : we take it */ + matchLength = ZSTD_count(ip + 1 + EQUAL_READ32, ip + 1 + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32; + if (depth == 0) + goto _storeSequence; + } + + /* first search (depth 0) */ + { + size_t offsetFound = 99999999; + size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offset = offsetFound; + } + + if (matchLength < EQUAL_READ32) { + ip += ((ip - anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */ + continue; + } + + /* let's try to find a better solution */ + if (depth >= 1) + while (ip < ilimit) { + ip++; + if ((offset) && ((offset_1 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32; + int const gain2 = (int)(mlRep * 3); + int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offset + 1) + 1); + if ((mlRep >= EQUAL_READ32) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + { + size_t offset2 = 99999999; + size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); + int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */ + int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 4); + if ((ml2 >= EQUAL_READ32) 
&& (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; /* search a better one */ + } + } + + /* let's find an even better one */ + if ((depth == 2) && (ip < ilimit)) { + ip++; + if ((offset) && ((offset_1 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_1)))) { + size_t const ml2 = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32; + int const gain2 = (int)(ml2 * 4); + int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 1); + if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) + matchLength = ml2, offset = 0, start = ip; + } + { + size_t offset2 = 99999999; + size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); + int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */ + int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 7); + if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } + } + } + break; /* nothing found : store previous solution */ + } + + /* NOTE: + * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. + * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which + * overflows the pointer, which is undefined behavior. 
+ */ + /* catch up */ + if (offset) { + while ((start > anchor) && (start > base + offset - ZSTD_REP_MOVE) && + (start[-1] == (start-offset+ZSTD_REP_MOVE)[-1])) /* only search for offset within prefix */ + { + start--; + matchLength++; + } + offset_2 = offset_1; + offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + + /* store sequence */ +_storeSequence: + { + size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) { + /* store sequence */ + matchLength = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_2, iend) + EQUAL_READ32; + offset = offset_2; + offset_2 = offset_1; + offset_1 = (U32)offset; /* swap repcodes */ + ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + } + + /* Save reps for next block */ + ctx->repToConfirm[0] = offset_1 ? offset_1 : savedOffset; + ctx->repToConfirm[1] = offset_2 ? 
offset_2 : savedOffset; + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +static void ZSTD_compressBlock_btlazy2(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 1, 2); } + +static void ZSTD_compressBlock_lazy2(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 2); } + +static void ZSTD_compressBlock_lazy(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 1); } + +static void ZSTD_compressBlock_greedy(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 0); } + +FORCE_INLINE +void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 searchMethod, const U32 depth) +{ + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + const BYTE *const base = ctx->base; + const U32 dictLimit = ctx->dictLimit; + const U32 lowestIndex = ctx->lowLimit; + const BYTE *const prefixStart = base + dictLimit; + const BYTE *const dictBase = ctx->dictBase; + const BYTE *const dictEnd = dictBase + dictLimit; + const BYTE *const dictStart = dictBase + ctx->lowLimit; + + const U32 maxSearches = 1 << ctx->params.cParams.searchLog; + const U32 mls = ctx->params.cParams.searchLength; + + typedef size_t (*searchMax_f)(ZSTD_CCtx * zc, const BYTE *ip, const BYTE *iLimit, size_t *offsetPtr, U32 maxNbAttempts, U32 matchLengthSearch); + searchMax_f searchMax = searchMethod ? 
ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS; + + U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1]; + + /* init */ + ctx->nextToUpdate3 = ctx->nextToUpdate; + ip += (ip == prefixStart); + + /* Match Loop */ + while (ip < ilimit) { + size_t matchLength = 0; + size_t offset = 0; + const BYTE *start = ip + 1; + U32 curr = (U32)(ip - base); + + /* check repCode */ + { + const U32 repIndex = (U32)(curr + 1 - offset_1); + const BYTE *const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (ZSTD_read32(ip + 1) == ZSTD_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = + ZSTD_count_2segments(ip + 1 + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32; + if (depth == 0) + goto _storeSequence; + } + } + + /* first search (depth 0) */ + { + size_t offsetFound = 99999999; + size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls); + if (ml2 > matchLength) + matchLength = ml2, start = ip, offset = offsetFound; + } + + if (matchLength < EQUAL_READ32) { + ip += ((ip - anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */ + continue; + } + + /* let's try to find a better solution */ + if (depth >= 1) + while (ip < ilimit) { + ip++; + curr++; + /* check repCode */ + if (offset) { + const U32 repIndex = (U32)(curr - offset_1); + const BYTE *const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) { + /* repcode detected */ + const BYTE *const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = + ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) + + EQUAL_READ32; + int const gain2 = (int)(repLength * 3); + int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offset + 1) + 1); + if ((repLength >= EQUAL_READ32) && (gain2 > gain1)) + matchLength = repLength, offset = 0, start = ip; + } + } + + /* search match, depth 1 */ + { + size_t offset2 = 99999999; + size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); + int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */ + int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 4); + if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; /* search a better one */ + } + } + + /* let's find an even better one */ + if ((depth == 2) && (ip < ilimit)) { + ip++; + curr++; + /* check repCode */ + if (offset) { + const U32 repIndex = (U32)(curr - offset_1); + const BYTE *const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) { + /* repcode detected */ + const BYTE *const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t repLength = ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend, + repEnd, prefixStart) + + EQUAL_READ32; + int gain2 = (int)(repLength * 4); + int gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 1); + if ((repLength >= EQUAL_READ32) && (gain2 > gain1)) + matchLength = repLength, offset = 0, start = ip; + } + } + + /* search match, depth 2 */ + { + size_t offset2 = 99999999; + size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls); + int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */ + int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 7); + if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } + } + } + break; /* nothing found : store previous solution */ + } + + /* catch up */ + if (offset) { + U32 const matchIndex = (U32)((start - base) - (offset - ZSTD_REP_MOVE)); + const BYTE *match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE *const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start > anchor) && (match > mStart) && (start[-1] == match[-1])) { + start--; + match--; + matchLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + + /* store sequence */ + _storeSequence : { + size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + while (ip <= ilimit) { + const U32 repIndex = (U32)((ip - base) - offset_2); + const BYTE *const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = + ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32; + offset = offset_2; + offset_2 = offset_1; + offset_1 = (U32)offset; /* swap offset history */ + ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } + break; + } + } + + /* Save reps for next block */ + ctx->repToConfirm[0] = offset_1; + ctx->repToConfirm[1] = offset_2; + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +void ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 0); } + +static void ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ + ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 1); +} + +static void ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ + ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 2); +} + +static void ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ + ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 1, 2); +} + +/* The optimal parser */ +#include "zstd_opt.h" + +static void ZSTD_compressBlock_btopt(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ +#ifdef ZSTD_OPT_H_91842398743 + ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0); +#else + (void)ctx; + (void)src; + (void)srcSize; + return; +#endif +} + +static void 
ZSTD_compressBlock_btopt2(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ +#ifdef ZSTD_OPT_H_91842398743 + ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1); +#else + (void)ctx; + (void)src; + (void)srcSize; + return; +#endif +} + +static void ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ +#ifdef ZSTD_OPT_H_91842398743 + ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 0); +#else + (void)ctx; + (void)src; + (void)srcSize; + return; +#endif +} + +static void ZSTD_compressBlock_btopt2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) +{ +#ifdef ZSTD_OPT_H_91842398743 + ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 1); +#else + (void)ctx; + (void)src; + (void)srcSize; + return; +#endif +} + +typedef void (*ZSTD_blockCompressor)(ZSTD_CCtx *ctx, const void *src, size_t srcSize); + +static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict) +{ + static const ZSTD_blockCompressor blockCompressor[2][8] = { + {ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, + ZSTD_compressBlock_btlazy2, ZSTD_compressBlock_btopt, ZSTD_compressBlock_btopt2}, + {ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict, + ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btopt2_extDict}}; + + return blockCompressor[extDict][(U32)strat]; +} + +static size_t ZSTD_compressBlock_internal(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->params.cParams.strategy, zc->lowLimit < zc->dictLimit); + const BYTE *const base = zc->base; + const BYTE *const istart = (const BYTE *)src; + const U32 curr = (U32)(istart - base); + if (srcSize < MIN_CBLOCK_SIZE + 
ZSTD_blockHeaderSize + 1) + return 0; /* don't even attempt compression below a certain srcSize */ + ZSTD_resetSeqStore(&(zc->seqStore)); + if (curr > zc->nextToUpdate + 384) + zc->nextToUpdate = curr - MIN(192, (U32)(curr - zc->nextToUpdate - 384)); /* update tree not updated after finding very long rep matches */ + blockCompressor(zc, src, srcSize); + return ZSTD_compressSequences(zc, dst, dstCapacity, srcSize); +} + +/*! ZSTD_compress_generic() : +* Compress a chunk of data into one or multiple blocks. +* All blocks will be terminated, all input will be consumed. +* Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. +* Frame is supposed already started (header already produced) +* @return : compressed size, or an error code +*/ +static size_t ZSTD_compress_generic(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, U32 lastFrameChunk) +{ + size_t blockSize = cctx->blockSize; + size_t remaining = srcSize; + const BYTE *ip = (const BYTE *)src; + BYTE *const ostart = (BYTE *)dst; + BYTE *op = ostart; + U32 const maxDist = 1 << cctx->params.cParams.windowLog; + + if (cctx->params.fParams.checksumFlag && srcSize) + xxh64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + size_t cSize; + + if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE) + return ERROR(dstSize_tooSmall); /* not enough space to store compressed block */ + if (remaining < blockSize) + blockSize = remaining; + + /* preemptive overflow correction */ + if (cctx->lowLimit > (3U << 29)) { + U32 const cycleMask = (1 << ZSTD_cycleLog(cctx->params.cParams.hashLog, cctx->params.cParams.strategy)) - 1; + U32 const curr = (U32)(ip - cctx->base); + U32 const newCurr = (curr & cycleMask) + (1 << cctx->params.cParams.windowLog); + U32 const correction = curr - newCurr; + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_64 <= 30); + ZSTD_reduceIndex(cctx, 
correction); + cctx->base += correction; + cctx->dictBase += correction; + cctx->lowLimit -= correction; + cctx->dictLimit -= correction; + if (cctx->nextToUpdate < correction) + cctx->nextToUpdate = 0; + else + cctx->nextToUpdate -= correction; + } + + if ((U32)(ip + blockSize - cctx->base) > cctx->loadedDictEnd + maxDist) { + /* enforce maxDist */ + U32 const newLowLimit = (U32)(ip + blockSize - cctx->base) - maxDist; + if (cctx->lowLimit < newLowLimit) + cctx->lowLimit = newLowLimit; + if (cctx->dictLimit < cctx->lowLimit) + cctx->dictLimit = cctx->lowLimit; + } + + cSize = ZSTD_compressBlock_internal(cctx, op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, ip, blockSize); + if (ZSTD_isError(cSize)) + return cSize; + + if (cSize == 0) { /* block is not compressible */ + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw) << 1) + (U32)(blockSize << 3); + if (blockSize + ZSTD_blockHeaderSize > dstCapacity) + return ERROR(dstSize_tooSmall); + ZSTD_writeLE32(op, cBlockHeader24); /* no pb, 4th byte will be overwritten */ + memcpy(op + ZSTD_blockHeaderSize, ip, blockSize); + cSize = ZSTD_blockHeaderSize + blockSize; + } else { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed) << 1) + (U32)(cSize << 3); + ZSTD_writeLE24(op, cBlockHeader24); + cSize += ZSTD_blockHeaderSize; + } + + remaining -= blockSize; + dstCapacity -= cSize; + ip += blockSize; + op += cSize; + } + + if (lastFrameChunk && (op > ostart)) + cctx->stage = ZSTDcs_ending; + return op - ostart; +} + +static size_t ZSTD_writeFrameHeader(void *dst, size_t dstCapacity, ZSTD_parameters params, U64 pledgedSrcSize, U32 dictID) +{ + BYTE *const op = (BYTE *)dst; + U32 const dictIDSizeCode = (dictID > 0) + (dictID >= 256) + (dictID >= 65536); /* 0-3 */ + U32 const checksumFlag = params.fParams.checksumFlag > 0; + U32 const windowSize = 1U << params.cParams.windowLog; + U32 const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize); + BYTE const windowLogByte = 
(BYTE)((params.cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3); + U32 const fcsCode = + params.fParams.contentSizeFlag ? (pledgedSrcSize >= 256) + (pledgedSrcSize >= 65536 + 256) + (pledgedSrcSize >= 0xFFFFFFFFU) : 0; /* 0-3 */ + BYTE const frameHeaderDecriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag << 2) + (singleSegment << 5) + (fcsCode << 6)); + size_t pos; + + if (dstCapacity < ZSTD_frameHeaderSize_max) + return ERROR(dstSize_tooSmall); + + ZSTD_writeLE32(dst, ZSTD_MAGICNUMBER); + op[4] = frameHeaderDecriptionByte; + pos = 5; + if (!singleSegment) + op[pos++] = windowLogByte; + switch (dictIDSizeCode) { + default: /* impossible */ + case 0: break; + case 1: + op[pos] = (BYTE)(dictID); + pos++; + break; + case 2: + ZSTD_writeLE16(op + pos, (U16)dictID); + pos += 2; + break; + case 3: + ZSTD_writeLE32(op + pos, dictID); + pos += 4; + break; + } + switch (fcsCode) { + default: /* impossible */ + case 0: + if (singleSegment) + op[pos++] = (BYTE)(pledgedSrcSize); + break; + case 1: + ZSTD_writeLE16(op + pos, (U16)(pledgedSrcSize - 256)); + pos += 2; + break; + case 2: + ZSTD_writeLE32(op + pos, (U32)(pledgedSrcSize)); + pos += 4; + break; + case 3: + ZSTD_writeLE64(op + pos, (U64)(pledgedSrcSize)); + pos += 8; + break; + } + return pos; +} + +static size_t ZSTD_compressContinue_internal(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, U32 frame, U32 lastFrameChunk) +{ + const BYTE *const ip = (const BYTE *)src; + size_t fhSize = 0; + + if (cctx->stage == ZSTDcs_created) + return ERROR(stage_wrong); /* missing init (ZSTD_compressBegin) */ + + if (frame && (cctx->stage == ZSTDcs_init)) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, cctx->frameContentSize, cctx->dictID); + if (ZSTD_isError(fhSize)) + return fhSize; + dstCapacity -= fhSize; + dst = (char *)dst + fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + /* Check if blocks follow each other */ + if (src != cctx->nextSrc) { + /* not contiguous */ + 
ptrdiff_t const delta = cctx->nextSrc - ip; + cctx->lowLimit = cctx->dictLimit; + cctx->dictLimit = (U32)(cctx->nextSrc - cctx->base); + cctx->dictBase = cctx->base; + cctx->base -= delta; + cctx->nextToUpdate = cctx->dictLimit; + if (cctx->dictLimit - cctx->lowLimit < HASH_READ_SIZE) + cctx->lowLimit = cctx->dictLimit; /* too small extDict */ + } + + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ((ip + srcSize > cctx->dictBase + cctx->lowLimit) & (ip < cctx->dictBase + cctx->dictLimit)) { + ptrdiff_t const highInputIdx = (ip + srcSize) - cctx->dictBase; + U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)cctx->dictLimit) ? cctx->dictLimit : (U32)highInputIdx; + cctx->lowLimit = lowLimitMax; + } + + cctx->nextSrc = ip + srcSize; + + if (srcSize) { + size_t const cSize = frame ? ZSTD_compress_generic(cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) + : ZSTD_compressBlock_internal(cctx, dst, dstCapacity, src, srcSize); + if (ZSTD_isError(cSize)) + return cSize; + return cSize + fhSize; + } else + return fhSize; +} + +size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 0); +} + +size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx) { return MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << cctx->params.cParams.windowLog); } + +size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t const blockSizeMax = ZSTD_getBlockSizeMax(cctx); + if (srcSize > blockSizeMax) + return ERROR(srcSize_wrong); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0, 0); +} + +/*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx *zc, const void *src, size_t srcSize) +{ /* Makes [src, src + srcSize) the current prefix and, when srcSize > HASH_READ_SIZE, calls the table routine (fill / insert / update) matching zc->params.cParams.strategy. */ + const BYTE *const ip = (const BYTE *)src; + const BYTE *const iend = ip + srcSize; + + /* input becomes curr prefix */ + zc->lowLimit = zc->dictLimit; + zc->dictLimit = (U32)(zc->nextSrc - zc->base); + zc->dictBase = zc->base; + zc->base += ip - zc->nextSrc; /* rebase: afterwards (ip - zc->base) equals the value just stored in zc->dictLimit */ + zc->nextToUpdate = zc->dictLimit; + zc->loadedDictEnd = zc->forceWindow ? 0 : (U32)(iend - zc->base); /* index of iend, or 0 when forceWindow is set */ + + zc->nextSrc = iend; + if (srcSize <= HASH_READ_SIZE) + return 0; /* window fields above are already updated; no table routine is called for content this short */ + + switch (zc->params.cParams.strategy) { + case ZSTD_fast: ZSTD_fillHashTable(zc, iend, zc->params.cParams.searchLength); break; + + case ZSTD_dfast: ZSTD_fillDoubleHashTable(zc, iend, zc->params.cParams.searchLength); break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + if (srcSize >= HASH_READ_SIZE) /* always true here: srcSize > HASH_READ_SIZE was established by the early return above */ + ZSTD_insertAndFindFirstIndex(zc, iend - HASH_READ_SIZE, zc->params.cParams.searchLength); + break; + + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btopt2: + if (srcSize >= HASH_READ_SIZE) /* always true here, same reason as in the greedy/lazy cases */ + ZSTD_updateTree(zc, iend - HASH_READ_SIZE, iend, 1 << zc->params.cParams.searchLog, zc->params.cParams.searchLength); + break; + + default: + return ERROR(GENERIC); /* strategy doesn't exist; impossible */ + } + + zc->nextToUpdate = (U32)(iend - zc->base); /* nextToUpdate now equals the index of iend */ + return 0; +} + +/* Dictionaries that assign zero probability to symbols that show up causes problems + when FSE encoding. Refuse dictionaries that assign zero probability to symbols + that we may encounter during compression. + NOTE: This behavior is not standard and could be improved in the future. 
*/ +static size_t ZSTD_checkDictNCount(short *normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) +{ /* Returns 0 only if dictMaxSymbolValue >= maxSymbolValue and normalizedCounter[s] is non-zero for every s in 0..maxSymbolValue inclusive; otherwise ERROR(dictionary_corrupted). */ + U32 s; + if (dictMaxSymbolValue < maxSymbolValue) + return ERROR(dictionary_corrupted); + for (s = 0; s <= maxSymbolValue; ++s) { + if (normalizedCounter[s] == 0) + return ERROR(dictionary_corrupted); + } + return 0; +} + +/* Dictionary format : + * See : + * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format + */ +/*! ZSTD_loadZstdDictionary() : + * @return : 0, or an error code + * assumptions : magic number supposed already checked + * dictSize supposed > 8 + */ +static size_t ZSTD_loadZstdDictionary(ZSTD_CCtx *cctx, const void *dict, size_t dictSize) +{ /* Walks the dictionary front to back: magic (skipped), dictID, Huffman table, then the offset-code, match-length and literal-length FSE tables, then three 4-byte repcodes; the bytes that remain are passed to ZSTD_loadDictionaryContent(). dictPtr advances by the size each reader reports. */ + const BYTE *dictPtr = (const BYTE *)dict; + const BYTE *const dictEnd = dictPtr + dictSize; + short offcodeNCount[MaxOff + 1]; + unsigned offcodeMaxValue = MaxOff; + + dictPtr += 4; /* skip magic number */ + cctx->dictID = cctx->params.fParams.noDictIDFlag ? 0 : ZSTD_readLE32(dictPtr); /* recorded as 0 when noDictIDFlag is set */ + dictPtr += 4; + + { + size_t const hufHeaderSize = HUF_readCTable_wksp(cctx->hufTable, 255, dictPtr, dictEnd - dictPtr, cctx->tmpCounters, sizeof(cctx->tmpCounters)); + if (HUF_isError(hufHeaderSize)) + return ERROR(dictionary_corrupted); + dictPtr += hufHeaderSize; + } + + { + unsigned offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(offcodeHeaderSize)) + return ERROR(dictionary_corrupted); + if (offcodeLog > OffFSELog) + return ERROR(dictionary_corrupted); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ + CHECK_E(FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)), + dictionary_corrupted); /* NOTE(review): CHECK_E is defined elsewhere; presumably it returns ERROR(dictionary_corrupted) when the call fails - confirm */ + dictPtr += offcodeHeaderSize; + } + + { + short matchlengthNCount[MaxML + 1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + 
size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(matchlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (matchlengthLog > MLFSELog) + return ERROR(dictionary_corrupted); + /* Every match length code must have non-zero probability */ + CHECK_F(ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML)); + CHECK_E( + FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)), + dictionary_corrupted); + dictPtr += matchlengthHeaderSize; + } + + { + short litlengthNCount[MaxLL + 1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(litlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (litlengthLog > LLFSELog) + return ERROR(dictionary_corrupted); + /* Every literal length code must have non-zero probability */ + CHECK_F(ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL)); + CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)), + dictionary_corrupted); + dictPtr += litlengthHeaderSize; + } + + if (dictPtr + 12 > dictEnd) + return ERROR(dictionary_corrupted); /* the three 32-bit repcodes read below need 12 more bytes */ + cctx->rep[0] = ZSTD_readLE32(dictPtr + 0); + cctx->rep[1] = ZSTD_readLE32(dictPtr + 4); + cctx->rep[2] = ZSTD_readLE32(dictPtr + 8); + dictPtr += 12; + + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ + } + /* All offset values <= 
dictContentSize + 128 KB must be representable */ + CHECK_F(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff))); + /* All repCodes must be <= dictContentSize and != 0*/ + { + U32 u; + for (u = 0; u < 3; u++) { + if (cctx->rep[u] == 0) + return ERROR(dictionary_corrupted); + if (cctx->rep[u] > dictContentSize) + return ERROR(dictionary_corrupted); + } + } + + cctx->flagStaticTables = 1; + cctx->flagStaticHufTable = HUF_repeat_valid; /* both flags are set only after every table and repcode check above has passed */ + return ZSTD_loadDictionaryContent(cctx, dictPtr, dictContentSize); + } +} + +/** ZSTD_compress_insertDictionary() : +* @return : 0, or an error code */ +static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx *cctx, const void *dict, size_t dictSize) +{ + if ((dict == NULL) || (dictSize <= 8)) + return 0; /* NULL or at most 8 bytes: nothing is loaded and success is reported */ + + /* dict as pure content */ + if ((ZSTD_readLE32(dict) != ZSTD_DICT_MAGIC) || (cctx->forceRawDict)) + return ZSTD_loadDictionaryContent(cctx, dict, dictSize); + + /* dict as zstd dictionary */ + return ZSTD_loadZstdDictionary(cctx, dict, dictSize); +} + +/*! ZSTD_compressBegin_internal() : +* @return : 0, or an error code */ +static size_t ZSTD_compressBegin_internal(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, ZSTD_parameters params, U64 pledgedSrcSize) +{ + ZSTD_compResetPolicy_e const crp = dictSize ? ZSTDcrp_fullReset : ZSTDcrp_continue; /* keyed on dictSize alone; dict is only tested for NULL later, inside ZSTD_compress_insertDictionary() */ + CHECK_F(ZSTD_resetCCtx_advanced(cctx, params, pledgedSrcSize, crp)); + return ZSTD_compress_insertDictionary(cctx, dict, dictSize); +} + +/*! 
ZSTD_compressBegin_advanced() : +* @return : 0, or an error code */ +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + /* compression parameters verification and optimization */ + CHECK_F(ZSTD_checkCParams(params.cParams)); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, pledgedSrcSize); +} + +size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, int compressionLevel) +{ + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, 0); +} + +size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel) { return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); } + +/*! ZSTD_writeEpilogue() : +* Ends a frame. +* @return : nb of bytes written into dst (or an error code) */ +static size_t ZSTD_writeEpilogue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity) +{ + BYTE *const ostart = (BYTE *)dst; + BYTE *op = ostart; + size_t fhSize = 0; + + if (cctx->stage == ZSTDcs_created) + return ERROR(stage_wrong); /* init missing */ + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, 0, 0); + if (ZSTD_isError(fhSize)) + return fhSize; + dstCapacity -= fhSize; + op += fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw) << 1) + 0; + if (dstCapacity < 4) + return ERROR(dstSize_tooSmall); + ZSTD_writeLE32(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } + + if (cctx->params.fParams.checksumFlag) { + U32 const checksum = (U32)xxh64_digest(&cctx->xxhState); + if (dstCapacity < 4) + return ERROR(dstSize_tooSmall); + ZSTD_writeLE32(op, checksum); + op += 
4; + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ + return op - ostart; +} + +size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 1); + if (ZSTD_isError(cSize)) + return cSize; + endResult = ZSTD_writeEpilogue(cctx, (char *)dst + cSize, dstCapacity - cSize); + if (ZSTD_isError(endResult)) + return endResult; + return cSize + endResult; +} + +static size_t ZSTD_compress_internal(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize, + ZSTD_parameters params) +{ + CHECK_F(ZSTD_compressBegin_internal(cctx, dict, dictSize, params, srcSize)); + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize, + ZSTD_parameters params) +{ + return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params); +} + +size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, ZSTD_parameters params) +{ + return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, NULL, 0, params); +} + +/* ===== Dictionary API ===== */ + +struct ZSTD_CDict_s { + void *dictBuffer; + const void *dictContent; + size_t dictContentSize; + ZSTD_CCtx *refContext; +}; /* typedef'd tp ZSTD_CDict within "zstd.h" */ + +size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams) { return ZSTD_CCtxWorkspaceBound(cParams) + ZSTD_ALIGN(sizeof(ZSTD_CDict)); } + +static ZSTD_CDict *ZSTD_createCDict_advanced(const void *dictBuffer, size_t dictSize, unsigned byReference, ZSTD_parameters params, ZSTD_customMem customMem) +{ + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + { + ZSTD_CDict *const 
cdict = (ZSTD_CDict *)ZSTD_malloc(sizeof(ZSTD_CDict), customMem); + ZSTD_CCtx *const cctx = ZSTD_createCCtx_advanced(customMem); + + if (!cdict || !cctx) { + ZSTD_free(cdict, customMem); + ZSTD_freeCCtx(cctx); + return NULL; + } + + if ((byReference) || (!dictBuffer) || (!dictSize)) { + cdict->dictBuffer = NULL; + cdict->dictContent = dictBuffer; + } else { + void *const internalBuffer = ZSTD_malloc(dictSize, customMem); + if (!internalBuffer) { + ZSTD_free(cctx, customMem); + ZSTD_free(cdict, customMem); + return NULL; + } + memcpy(internalBuffer, dictBuffer, dictSize); + cdict->dictBuffer = internalBuffer; + cdict->dictContent = internalBuffer; + } + + { + size_t const errorCode = ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0); + if (ZSTD_isError(errorCode)) { + ZSTD_free(cdict->dictBuffer, customMem); + ZSTD_free(cdict, customMem); + ZSTD_freeCCtx(cctx); + return NULL; + } + } + + cdict->refContext = cctx; + cdict->dictContentSize = dictSize; + return cdict; + } +} + +ZSTD_CDict *ZSTD_initCDict(const void *dict, size_t dictSize, ZSTD_parameters params, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + return ZSTD_createCDict_advanced(dict, dictSize, 1, params, stackMem); +} + +size_t ZSTD_freeCDict(ZSTD_CDict *cdict) +{ + if (cdict == NULL) + return 0; /* support free on NULL */ + { + ZSTD_customMem const cMem = cdict->refContext->customMem; + ZSTD_freeCCtx(cdict->refContext); + ZSTD_free(cdict->dictBuffer, cMem); + ZSTD_free(cdict, cMem); + return 0; + } +} + +static ZSTD_parameters ZSTD_getParamsFromCDict(const ZSTD_CDict *cdict) { return ZSTD_getParamsFromCCtx(cdict->refContext); } + +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict, unsigned long long pledgedSrcSize) +{ + if (cdict->dictContentSize) + CHECK_F(ZSTD_copyCCtx(cctx, cdict->refContext, pledgedSrcSize)) + else { + ZSTD_parameters params = cdict->refContext->params; + 
params.fParams.contentSizeFlag = (pledgedSrcSize > 0); + CHECK_F(ZSTD_compressBegin_advanced(cctx, NULL, 0, params, pledgedSrcSize)); + } + return 0; +} + +/*! ZSTD_compress_usingCDict() : +* Compression using a digested Dictionary. +* Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. +* Note that compression level is decided during dictionary creation */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const ZSTD_CDict *cdict) +{ + CHECK_F(ZSTD_compressBegin_usingCDict(cctx, cdict, srcSize)); + + if (cdict->refContext->params.fParams.contentSizeFlag == 1) { + cctx->params.fParams.contentSizeFlag = 1; + cctx->frameContentSize = srcSize; + } else { + cctx->params.fParams.contentSizeFlag = 0; + } + + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +/* ****************************************************************** +* Streaming +********************************************************************/ + +typedef enum { zcss_init, zcss_load, zcss_flush, zcss_final } ZSTD_cStreamStage; + +struct ZSTD_CStream_s { + ZSTD_CCtx *cctx; + ZSTD_CDict *cdictLocal; + const ZSTD_CDict *cdict; + char *inBuff; + size_t inBuffSize; + size_t inToCompress; + size_t inBuffPos; + size_t inBuffTarget; + size_t blockSize; + char *outBuff; + size_t outBuffSize; + size_t outBuffContentSize; + size_t outBuffFlushedSize; + ZSTD_cStreamStage stage; + U32 checksum; + U32 frameEnded; + U64 pledgedSrcSize; + U64 inputProcessed; + ZSTD_parameters params; + ZSTD_customMem customMem; +}; /* typedef'd to ZSTD_CStream within "zstd.h" */ + +size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams) +{ + size_t const inBuffSize = (size_t)1 << cParams.windowLog; + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, inBuffSize); + size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1; + + return ZSTD_CCtxWorkspaceBound(cParams) + 
ZSTD_ALIGN(sizeof(ZSTD_CStream)) + ZSTD_ALIGN(inBuffSize) + ZSTD_ALIGN(outBuffSize); +} + +ZSTD_CStream *ZSTD_createCStream_advanced(ZSTD_customMem customMem) +{ + ZSTD_CStream *zcs; + + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + zcs = (ZSTD_CStream *)ZSTD_malloc(sizeof(ZSTD_CStream), customMem); + if (zcs == NULL) + return NULL; + memset(zcs, 0, sizeof(ZSTD_CStream)); + memcpy(&zcs->customMem, &customMem, sizeof(ZSTD_customMem)); + zcs->cctx = ZSTD_createCCtx_advanced(customMem); + if (zcs->cctx == NULL) { + ZSTD_freeCStream(zcs); + return NULL; + } + return zcs; +} + +size_t ZSTD_freeCStream(ZSTD_CStream *zcs) +{ + if (zcs == NULL) + return 0; /* support free on NULL */ + { + ZSTD_customMem const cMem = zcs->customMem; + ZSTD_freeCCtx(zcs->cctx); + zcs->cctx = NULL; + ZSTD_freeCDict(zcs->cdictLocal); + zcs->cdictLocal = NULL; + ZSTD_free(zcs->inBuff, cMem); + zcs->inBuff = NULL; + ZSTD_free(zcs->outBuff, cMem); + zcs->outBuff = NULL; + ZSTD_free(zcs, cMem); + return 0; + } +} + +/*====== Initialization ======*/ + +size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; } +size_t ZSTD_CStreamOutSize(void) { return ZSTD_compressBound(ZSTD_BLOCKSIZE_ABSOLUTEMAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */; } + +static size_t ZSTD_resetCStream_internal(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize) +{ + if (zcs->inBuffSize == 0) + return ERROR(stage_wrong); /* zcs has not been init at least once => can't reset */ + + if (zcs->cdict) + CHECK_F(ZSTD_compressBegin_usingCDict(zcs->cctx, zcs->cdict, pledgedSrcSize)) + else + CHECK_F(ZSTD_compressBegin_advanced(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize)); + + zcs->inToCompress = 0; + zcs->inBuffPos = 0; + zcs->inBuffTarget = zcs->blockSize; + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; + zcs->stage = zcss_load; + zcs->frameEnded = 0; + zcs->pledgedSrcSize = pledgedSrcSize; + zcs->inputProcessed = 0; + return 0; /* ready to go */ +} + +size_t 
ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize) +{ + + zcs->params.fParams.contentSizeFlag = (pledgedSrcSize > 0); + + return ZSTD_resetCStream_internal(zcs, pledgedSrcSize); +} + +static size_t ZSTD_initCStream_advanced(ZSTD_CStream *zcs, const void *dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + /* allocate buffers */ + { + size_t const neededInBuffSize = (size_t)1 << params.cParams.windowLog; + if (zcs->inBuffSize < neededInBuffSize) { + zcs->inBuffSize = neededInBuffSize; + ZSTD_free(zcs->inBuff, zcs->customMem); + zcs->inBuff = (char *)ZSTD_malloc(neededInBuffSize, zcs->customMem); + if (zcs->inBuff == NULL) + return ERROR(memory_allocation); + } + zcs->blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, neededInBuffSize); + } + if (zcs->outBuffSize < ZSTD_compressBound(zcs->blockSize) + 1) { + zcs->outBuffSize = ZSTD_compressBound(zcs->blockSize) + 1; + ZSTD_free(zcs->outBuff, zcs->customMem); + zcs->outBuff = (char *)ZSTD_malloc(zcs->outBuffSize, zcs->customMem); + if (zcs->outBuff == NULL) + return ERROR(memory_allocation); + } + + if (dict && dictSize >= 8) { + ZSTD_freeCDict(zcs->cdictLocal); + zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0, params, zcs->customMem); + if (zcs->cdictLocal == NULL) + return ERROR(memory_allocation); + zcs->cdict = zcs->cdictLocal; + } else + zcs->cdict = NULL; + + zcs->checksum = params.fParams.checksumFlag > 0; + zcs->params = params; + + return ZSTD_resetCStream_internal(zcs, pledgedSrcSize); +} + +ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, unsigned long long pledgedSrcSize, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + ZSTD_CStream *const zcs = ZSTD_createCStream_advanced(stackMem); + if (zcs) { + size_t const code = ZSTD_initCStream_advanced(zcs, NULL, 0, params, pledgedSrcSize); + if (ZSTD_isError(code)) { + return NULL; + } + } + return zcs; +} + +ZSTD_CStream 
*ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, unsigned long long pledgedSrcSize, void *workspace, size_t workspaceSize) +{ + ZSTD_parameters const params = ZSTD_getParamsFromCDict(cdict); + ZSTD_CStream *const zcs = ZSTD_initCStream(params, pledgedSrcSize, workspace, workspaceSize); + if (zcs) { + zcs->cdict = cdict; + if (ZSTD_isError(ZSTD_resetCStream_internal(zcs, pledgedSrcSize))) { + return NULL; + } + } + return zcs; +} + +/*====== Compression ======*/ + +typedef enum { zsf_gather, zsf_flush, zsf_end } ZSTD_flush_e; + +ZSTD_STATIC size_t ZSTD_limitCopy(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t const length = MIN(dstCapacity, srcSize); + memcpy(dst, src, length); + return length; +} + +static size_t ZSTD_compressStream_generic(ZSTD_CStream *zcs, void *dst, size_t *dstCapacityPtr, const void *src, size_t *srcSizePtr, ZSTD_flush_e const flush) +{ + U32 someMoreWork = 1; + const char *const istart = (const char *)src; + const char *const iend = istart + *srcSizePtr; + const char *ip = istart; + char *const ostart = (char *)dst; + char *const oend = ostart + *dstCapacityPtr; + char *op = ostart; + + while (someMoreWork) { + switch (zcs->stage) { + case zcss_init: + return ERROR(init_missing); /* stream not (re)initialised : call ZSTD_initCStream() or ZSTD_resetCStream() first ! 
*/ + + case zcss_load: + /* complete inBuffer */ + { + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy(zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend - ip); + zcs->inBuffPos += loaded; + ip += loaded; + if ((zcs->inBuffPos == zcs->inToCompress) || (!flush && (toLoad != loaded))) { + someMoreWork = 0; + break; /* not enough input to get a full block : stop there, wait for more */ + } + } + /* compress curr block (note : this stage cannot be stopped in the middle) */ + { + void *cDst; + size_t cSize; + size_t const iSize = zcs->inBuffPos - zcs->inToCompress; + size_t oSize = oend - op; + if (oSize >= ZSTD_compressBound(iSize)) + cDst = op; /* compress directly into output buffer (avoid flush stage) */ + else + cDst = zcs->outBuff, oSize = zcs->outBuffSize; + cSize = (flush == zsf_end) ? ZSTD_compressEnd(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize) + : ZSTD_compressContinue(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize); + if (ZSTD_isError(cSize)) + return cSize; + if (flush == zsf_end) + zcs->frameEnded = 1; + /* prepare next block */ + zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; + if (zcs->inBuffTarget > zcs->inBuffSize) + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; /* note : inBuffSize >= blockSize */ + zcs->inToCompress = zcs->inBuffPos; + if (cDst == op) { + op += cSize; + break; + } /* no need to flush */ + zcs->outBuffContentSize = cSize; + zcs->outBuffFlushedSize = 0; + zcs->stage = zcss_flush; /* pass-through to flush stage */ + } + + case zcss_flush: { + size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, oend - op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + op += flushed; + zcs->outBuffFlushedSize += flushed; + if (toFlush != flushed) { + someMoreWork = 0; + break; + } /* dst too small to store flushed data : stop there */ + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; 
+ zcs->stage = zcss_load; + break; + } + + case zcss_final: + someMoreWork = 0; /* do nothing */ + break; + + default: + return ERROR(GENERIC); /* impossible */ + } + } + + *srcSizePtr = ip - istart; + *dstCapacityPtr = op - ostart; + zcs->inputProcessed += *srcSizePtr; + if (zcs->frameEnded) + return 0; + { + size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos; + if (hintInSize == 0) + hintInSize = zcs->blockSize; + return hintInSize; + } +} + +size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output, ZSTD_inBuffer *input) +{ + size_t sizeRead = input->size - input->pos; + size_t sizeWritten = output->size - output->pos; + size_t const result = + ZSTD_compressStream_generic(zcs, (char *)(output->dst) + output->pos, &sizeWritten, (const char *)(input->src) + input->pos, &sizeRead, zsf_gather); + input->pos += sizeRead; + output->pos += sizeWritten; + return result; +} + +/*====== Finalize ======*/ + +/*! ZSTD_flushStream() : +* @return : amount of data remaining to flush */ +size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output) +{ + size_t srcSize = 0; + size_t sizeWritten = output->size - output->pos; + size_t const result = ZSTD_compressStream_generic(zcs, (char *)(output->dst) + output->pos, &sizeWritten, &srcSize, + &srcSize, /* use a valid src address instead of NULL */ + zsf_flush); + output->pos += sizeWritten; + if (ZSTD_isError(result)) + return result; + return zcs->outBuffContentSize - zcs->outBuffFlushedSize; /* remaining to flush */ +} + +size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output) +{ + BYTE *const ostart = (BYTE *)(output->dst) + output->pos; + BYTE *const oend = (BYTE *)(output->dst) + output->size; + BYTE *op = ostart; + + if ((zcs->pledgedSrcSize) && (zcs->inputProcessed != zcs->pledgedSrcSize)) + return ERROR(srcSize_wrong); /* pledgedSrcSize not respected */ + + if (zcs->stage != zcss_final) { + /* flush whatever remains */ + size_t srcSize = 0; + size_t sizeWritten = output->size - output->pos; + 
size_t const notEnded = + ZSTD_compressStream_generic(zcs, ostart, &sizeWritten, &srcSize, &srcSize, zsf_end); /* use a valid src address instead of NULL */ + size_t const remainingToFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + op += sizeWritten; + if (remainingToFlush) { + output->pos += sizeWritten; + return remainingToFlush + ZSTD_BLOCKHEADERSIZE /* final empty block */ + (zcs->checksum * 4); + } + /* create epilogue */ + zcs->stage = zcss_final; + zcs->outBuffContentSize = !notEnded ? 0 : ZSTD_compressEnd(zcs->cctx, zcs->outBuff, zcs->outBuffSize, NULL, + 0); /* write epilogue, including final empty block, into outBuff */ + } + + /* flush epilogue */ + { + size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, oend - op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + op += flushed; + zcs->outBuffFlushedSize += flushed; + output->pos += op - ostart; + if (toFlush == flushed) + zcs->stage = zcss_init; /* end reached */ + return toFlush - flushed; + } +} + +/*-===== Pre-defined compression levels =====-*/ + +#define ZSTD_DEFAULT_CLEVEL 1 +#define ZSTD_MAX_CLEVEL 22 +int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } + +static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL + 1] = { + { + /* "default" */ + /* W, C, H, S, L, TL, strat */ + {18, 12, 12, 1, 7, 16, ZSTD_fast}, /* level 0 - never used */ + {19, 13, 14, 1, 7, 16, ZSTD_fast}, /* level 1 */ + {19, 15, 16, 1, 6, 16, ZSTD_fast}, /* level 2 */ + {20, 16, 17, 1, 5, 16, ZSTD_dfast}, /* level 3.*/ + {20, 18, 18, 1, 5, 16, ZSTD_dfast}, /* level 4.*/ + {20, 15, 18, 3, 5, 16, ZSTD_greedy}, /* level 5 */ + {21, 16, 19, 2, 5, 16, ZSTD_lazy}, /* level 6 */ + {21, 17, 20, 3, 5, 16, ZSTD_lazy}, /* level 7 */ + {21, 18, 20, 3, 5, 16, ZSTD_lazy2}, /* level 8 */ + {21, 20, 20, 3, 5, 16, ZSTD_lazy2}, /* level 9 */ + {21, 19, 21, 4, 5, 16, ZSTD_lazy2}, /* level 10 */ + {22, 20, 22, 4, 5, 16, ZSTD_lazy2}, /* level 
11 */ + {22, 20, 22, 5, 5, 16, ZSTD_lazy2}, /* level 12 */ + {22, 21, 22, 5, 5, 16, ZSTD_lazy2}, /* level 13 */ + {22, 21, 22, 6, 5, 16, ZSTD_lazy2}, /* level 14 */ + {22, 21, 21, 5, 5, 16, ZSTD_btlazy2}, /* level 15 */ + {23, 22, 22, 5, 5, 16, ZSTD_btlazy2}, /* level 16 */ + {23, 21, 22, 4, 5, 24, ZSTD_btopt}, /* level 17 */ + {23, 23, 22, 6, 5, 32, ZSTD_btopt}, /* level 18 */ + {23, 23, 22, 6, 3, 48, ZSTD_btopt}, /* level 19 */ + {25, 25, 23, 7, 3, 64, ZSTD_btopt2}, /* level 20 */ + {26, 26, 23, 7, 3, 256, ZSTD_btopt2}, /* level 21 */ + {27, 27, 25, 9, 3, 512, ZSTD_btopt2}, /* level 22 */ + }, + { + /* for srcSize <= 256 KB */ + /* W, C, H, S, L, T, strat */ + {0, 0, 0, 0, 0, 0, ZSTD_fast}, /* level 0 - not used */ + {18, 13, 14, 1, 6, 8, ZSTD_fast}, /* level 1 */ + {18, 14, 13, 1, 5, 8, ZSTD_dfast}, /* level 2 */ + {18, 16, 15, 1, 5, 8, ZSTD_dfast}, /* level 3 */ + {18, 15, 17, 1, 5, 8, ZSTD_greedy}, /* level 4.*/ + {18, 16, 17, 4, 5, 8, ZSTD_greedy}, /* level 5.*/ + {18, 16, 17, 3, 5, 8, ZSTD_lazy}, /* level 6.*/ + {18, 17, 17, 4, 4, 8, ZSTD_lazy}, /* level 7 */ + {18, 17, 17, 4, 4, 8, ZSTD_lazy2}, /* level 8 */ + {18, 17, 17, 5, 4, 8, ZSTD_lazy2}, /* level 9 */ + {18, 17, 17, 6, 4, 8, ZSTD_lazy2}, /* level 10 */ + {18, 18, 17, 6, 4, 8, ZSTD_lazy2}, /* level 11.*/ + {18, 18, 17, 7, 4, 8, ZSTD_lazy2}, /* level 12.*/ + {18, 19, 17, 6, 4, 8, ZSTD_btlazy2}, /* level 13 */ + {18, 18, 18, 4, 4, 16, ZSTD_btopt}, /* level 14.*/ + {18, 18, 18, 4, 3, 16, ZSTD_btopt}, /* level 15.*/ + {18, 19, 18, 6, 3, 32, ZSTD_btopt}, /* level 16.*/ + {18, 19, 18, 8, 3, 64, ZSTD_btopt}, /* level 17.*/ + {18, 19, 18, 9, 3, 128, ZSTD_btopt}, /* level 18.*/ + {18, 19, 18, 10, 3, 256, ZSTD_btopt}, /* level 19.*/ + {18, 19, 18, 11, 3, 512, ZSTD_btopt2}, /* level 20.*/ + {18, 19, 18, 12, 3, 512, ZSTD_btopt2}, /* level 21.*/ + {18, 19, 18, 13, 3, 512, ZSTD_btopt2}, /* level 22.*/ + }, + { + /* for srcSize <= 128 KB */ + /* W, C, H, S, L, T, strat */ + {17, 12, 12, 1, 7, 8, ZSTD_fast}, /* level 
0 - not used */ + {17, 12, 13, 1, 6, 8, ZSTD_fast}, /* level 1 */ + {17, 13, 16, 1, 5, 8, ZSTD_fast}, /* level 2 */ + {17, 16, 16, 2, 5, 8, ZSTD_dfast}, /* level 3 */ + {17, 13, 15, 3, 4, 8, ZSTD_greedy}, /* level 4 */ + {17, 15, 17, 4, 4, 8, ZSTD_greedy}, /* level 5 */ + {17, 16, 17, 3, 4, 8, ZSTD_lazy}, /* level 6 */ + {17, 15, 17, 4, 4, 8, ZSTD_lazy2}, /* level 7 */ + {17, 17, 17, 4, 4, 8, ZSTD_lazy2}, /* level 8 */ + {17, 17, 17, 5, 4, 8, ZSTD_lazy2}, /* level 9 */ + {17, 17, 17, 6, 4, 8, ZSTD_lazy2}, /* level 10 */ + {17, 17, 17, 7, 4, 8, ZSTD_lazy2}, /* level 11 */ + {17, 17, 17, 8, 4, 8, ZSTD_lazy2}, /* level 12 */ + {17, 18, 17, 6, 4, 8, ZSTD_btlazy2}, /* level 13.*/ + {17, 17, 17, 7, 3, 8, ZSTD_btopt}, /* level 14.*/ + {17, 17, 17, 7, 3, 16, ZSTD_btopt}, /* level 15.*/ + {17, 18, 17, 7, 3, 32, ZSTD_btopt}, /* level 16.*/ + {17, 18, 17, 7, 3, 64, ZSTD_btopt}, /* level 17.*/ + {17, 18, 17, 7, 3, 256, ZSTD_btopt}, /* level 18.*/ + {17, 18, 17, 8, 3, 256, ZSTD_btopt}, /* level 19.*/ + {17, 18, 17, 9, 3, 256, ZSTD_btopt2}, /* level 20.*/ + {17, 18, 17, 10, 3, 256, ZSTD_btopt2}, /* level 21.*/ + {17, 18, 17, 11, 3, 512, ZSTD_btopt2}, /* level 22.*/ + }, + { + /* for srcSize <= 16 KB */ + /* W, C, H, S, L, T, strat */ + {14, 12, 12, 1, 7, 6, ZSTD_fast}, /* level 0 - not used */ + {14, 14, 14, 1, 6, 6, ZSTD_fast}, /* level 1 */ + {14, 14, 14, 1, 4, 6, ZSTD_fast}, /* level 2 */ + {14, 14, 14, 1, 4, 6, ZSTD_dfast}, /* level 3.*/ + {14, 14, 14, 4, 4, 6, ZSTD_greedy}, /* level 4.*/ + {14, 14, 14, 3, 4, 6, ZSTD_lazy}, /* level 5.*/ + {14, 14, 14, 4, 4, 6, ZSTD_lazy2}, /* level 6 */ + {14, 14, 14, 5, 4, 6, ZSTD_lazy2}, /* level 7 */ + {14, 14, 14, 6, 4, 6, ZSTD_lazy2}, /* level 8.*/ + {14, 15, 14, 6, 4, 6, ZSTD_btlazy2}, /* level 9.*/ + {14, 15, 14, 3, 3, 6, ZSTD_btopt}, /* level 10.*/ + {14, 15, 14, 6, 3, 8, ZSTD_btopt}, /* level 11.*/ + {14, 15, 14, 6, 3, 16, ZSTD_btopt}, /* level 12.*/ + {14, 15, 14, 6, 3, 24, ZSTD_btopt}, /* level 13.*/ + {14, 15, 15, 6, 3, 48, 
ZSTD_btopt}, /* level 14.*/ + {14, 15, 15, 6, 3, 64, ZSTD_btopt}, /* level 15.*/ + {14, 15, 15, 6, 3, 96, ZSTD_btopt}, /* level 16.*/ + {14, 15, 15, 6, 3, 128, ZSTD_btopt}, /* level 17.*/ + {14, 15, 15, 6, 3, 256, ZSTD_btopt}, /* level 18.*/ + {14, 15, 15, 7, 3, 256, ZSTD_btopt}, /* level 19.*/ + {14, 15, 15, 8, 3, 256, ZSTD_btopt2}, /* level 20.*/ + {14, 15, 15, 9, 3, 256, ZSTD_btopt2}, /* level 21.*/ + {14, 15, 15, 10, 3, 256, ZSTD_btopt2}, /* level 22.*/ + }, +}; + +/*! ZSTD_getCParams() : +* @return ZSTD_compressionParameters structure for a selected compression level, `srcSize` and `dictSize`. +* Size values are optional, provide 0 if not known or unused */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSize, size_t dictSize) +{ + ZSTD_compressionParameters cp; + size_t const addedSize = srcSize ? 0 : 500; + U64 const rSize = srcSize + dictSize ? srcSize + dictSize + addedSize : (U64)-1; + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); /* intentional underflow for srcSizeHint == 0 */ + if (compressionLevel <= 0) + compressionLevel = ZSTD_DEFAULT_CLEVEL; /* 0 == default; no negative compressionLevel yet */ + if (compressionLevel > ZSTD_MAX_CLEVEL) + compressionLevel = ZSTD_MAX_CLEVEL; + cp = ZSTD_defaultCParameters[tableID][compressionLevel]; + if (ZSTD_32bits()) { /* auto-correction, for 32-bits mode */ + if (cp.windowLog > ZSTD_WINDOWLOG_MAX) + cp.windowLog = ZSTD_WINDOWLOG_MAX; + if (cp.chainLog > ZSTD_CHAINLOG_MAX) + cp.chainLog = ZSTD_CHAINLOG_MAX; + if (cp.hashLog > ZSTD_HASHLOG_MAX) + cp.hashLog = ZSTD_HASHLOG_MAX; + } + cp = ZSTD_adjustCParams(cp, srcSize, dictSize); + return cp; +} + +/*! ZSTD_getParams() : +* same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object (instead of `ZSTD_compressionParameters`). 
+* All fields of `ZSTD_frameParameters` are set to default (0) */ +ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSize, size_t dictSize) +{ + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSize, dictSize); + memset(&params, 0, sizeof(params)); + params.cParams = cParams; + return params; +} + +EXPORT_SYMBOL(ZSTD_maxCLevel); +EXPORT_SYMBOL(ZSTD_compressBound); + +EXPORT_SYMBOL(ZSTD_CCtxWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initCCtx); +EXPORT_SYMBOL(ZSTD_compressCCtx); +EXPORT_SYMBOL(ZSTD_compress_usingDict); + +EXPORT_SYMBOL(ZSTD_CDictWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initCDict); +EXPORT_SYMBOL(ZSTD_compress_usingCDict); + +EXPORT_SYMBOL(ZSTD_CStreamWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initCStream); +EXPORT_SYMBOL(ZSTD_initCStream_usingCDict); +EXPORT_SYMBOL(ZSTD_resetCStream); +EXPORT_SYMBOL(ZSTD_compressStream); +EXPORT_SYMBOL(ZSTD_flushStream); +EXPORT_SYMBOL(ZSTD_endStream); +EXPORT_SYMBOL(ZSTD_CStreamInSize); +EXPORT_SYMBOL(ZSTD_CStreamOutSize); + +EXPORT_SYMBOL(ZSTD_getCParams); +EXPORT_SYMBOL(ZSTD_getParams); +EXPORT_SYMBOL(ZSTD_checkCParams); +EXPORT_SYMBOL(ZSTD_adjustCParams); + +EXPORT_SYMBOL(ZSTD_compressBegin); +EXPORT_SYMBOL(ZSTD_compressBegin_usingDict); +EXPORT_SYMBOL(ZSTD_compressBegin_advanced); +EXPORT_SYMBOL(ZSTD_copyCCtx); +EXPORT_SYMBOL(ZSTD_compressBegin_usingCDict); +EXPORT_SYMBOL(ZSTD_compressContinue); +EXPORT_SYMBOL(ZSTD_compressEnd); + +EXPORT_SYMBOL(ZSTD_getBlockSizeMax); +EXPORT_SYMBOL(ZSTD_compressBlock); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Compressor"); diff --git a/lib/zstd/decompress.c b/lib/zstd/decompress.c new file mode 100644 index 000000000000..b17846725ca0 --- /dev/null +++ b/lib/zstd/decompress.c @@ -0,0 +1,2528 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +/* *************************************************************** +* Tuning parameters +*****************************************************************/ +/*! +* MAXWINDOWSIZE_DEFAULT : +* maximum window size accepted by DStream, by default. +* Frames requiring more memory will be rejected. +*/ +#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT +#define ZSTD_MAXWINDOWSIZE_DEFAULT ((1 << ZSTD_WINDOWLOG_MAX) + 1) /* defined within zstd.h */ +#endif + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include "fse.h" +#include "huf.h" +#include "mem.h" /* low level memory routines */ +#include "zstd_internal.h" +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> /* memcpy, memmove, memset */ + +#define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) + +/*-************************************* +* Macros +***************************************/ +#define ZSTD_isError ERR_isError /* for inlining */ +#define FSE_isError ERR_isError +#define HUF_isError ERR_isError + +/*_******************************************************* +* Memory operations +**********************************************************/ +static void ZSTD_copy4(void *dst, const void *src) { memcpy(dst, src, 4); } + +/*-************************************************************* +* Context management +***************************************************************/ 
+typedef enum { + ZSTDds_getFrameHeaderSize, + ZSTDds_decodeFrameHeader, + ZSTDds_decodeBlockHeader, + ZSTDds_decompressBlock, + ZSTDds_decompressLastBlock, + ZSTDds_checkChecksum, + ZSTDds_decodeSkippableHeader, + ZSTDds_skipFrame +} ZSTD_dStage; + +typedef struct { + FSE_DTable LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)]; + FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)]; + FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)]; + HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ + U64 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32 / 2]; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_entropyTables_t; + +struct ZSTD_DCtx_s { + const FSE_DTable *LLTptr; + const FSE_DTable *MLTptr; + const FSE_DTable *OFTptr; + const HUF_DTable *HUFptr; + ZSTD_entropyTables_t entropy; + const void *previousDstEnd; /* detect continuity */ + const void *base; /* start of curr segment */ + const void *vBase; /* virtual start of previous segment if it was just before curr one */ + const void *dictEnd; /* end of previous segment */ + size_t expected; + ZSTD_frameParams fParams; + blockType_e bType; /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */ + ZSTD_dStage stage; + U32 litEntropy; + U32 fseEntropy; + struct xxh64_state xxhState; + size_t headerSize; + U32 dictID; + const BYTE *litPtr; + ZSTD_customMem customMem; + size_t litSize; + size_t rleSize; + BYTE litBuffer[ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH]; + BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; +}; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + +size_t ZSTD_DCtxWorkspaceBound(void) { return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_DCtx)); } + +size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx) +{ + dctx->expected = ZSTD_frameHeaderSize_prefix; + dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->previousDstEnd = NULL; + dctx->base = NULL; + dctx->vBase = NULL; + dctx->dictEnd = NULL; + dctx->entropy.hufTable[0] = 
(HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; + dctx->MLTptr = dctx->entropy.MLTable; + dctx->OFTptr = dctx->entropy.OFTable; + dctx->HUFptr = dctx->entropy.hufTable; + return 0; +} + +ZSTD_DCtx *ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + ZSTD_DCtx *dctx; + + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + dctx = (ZSTD_DCtx *)ZSTD_malloc(sizeof(ZSTD_DCtx), customMem); + if (!dctx) + return NULL; + memcpy(&dctx->customMem, &customMem, sizeof(customMem)); + ZSTD_decompressBegin(dctx); + return dctx; +} + +ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + return ZSTD_createDCtx_advanced(stackMem); +} + +size_t ZSTD_freeDCtx(ZSTD_DCtx *dctx) +{ + if (dctx == NULL) + return 0; /* support free on NULL */ + ZSTD_free(dctx, dctx->customMem); + return 0; /* reserved as a potential error code in the future */ +} + +void ZSTD_copyDCtx(ZSTD_DCtx *dstDCtx, const ZSTD_DCtx *srcDCtx) +{ + size_t const workSpaceSize = (ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH) + ZSTD_frameHeaderSize_max; + memcpy(dstDCtx, srcDCtx, sizeof(ZSTD_DCtx) - workSpaceSize); /* no need to copy workspace */ +} + +static void ZSTD_refDDict(ZSTD_DCtx *dstDCtx, const ZSTD_DDict *ddict); + +/*-************************************************************* +* Decompression section +***************************************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. 
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +unsigned ZSTD_isFrame(const void *buffer, size_t size) +{ + if (size < 4) + return 0; + { + U32 const magic = ZSTD_readLE32(buffer); + if (magic == ZSTD_MAGICNUMBER) + return 1; + if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) + return 1; + } + return 0; +} + +/** ZSTD_frameHeaderSize() : +* srcSize must be >= ZSTD_frameHeaderSize_prefix. +* @return : size of the Frame Header */ +static size_t ZSTD_frameHeaderSize(const void *src, size_t srcSize) +{ + if (srcSize < ZSTD_frameHeaderSize_prefix) + return ERROR(srcSize_wrong); + { + BYTE const fhd = ((const BYTE *)src)[4]; + U32 const dictID = fhd & 3; + U32 const singleSegment = (fhd >> 5) & 1; + U32 const fcsId = fhd >> 6; + return ZSTD_frameHeaderSize_prefix + !singleSegment + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + (singleSegment && !fcsId); + } +} + +/** ZSTD_getFrameParams() : +* decode Frame Header, or require larger `srcSize`. 
+* @return : 0, `fparamsPtr` is correctly filled, +* >0, `srcSize` is too small, result is expected `srcSize`, +* or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src, size_t srcSize) +{ + const BYTE *ip = (const BYTE *)src; + + if (srcSize < ZSTD_frameHeaderSize_prefix) + return ZSTD_frameHeaderSize_prefix; + if (ZSTD_readLE32(src) != ZSTD_MAGICNUMBER) { + if ((ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + if (srcSize < ZSTD_skippableHeaderSize) + return ZSTD_skippableHeaderSize; /* magic number + skippable frame length */ + memset(fparamsPtr, 0, sizeof(*fparamsPtr)); + fparamsPtr->frameContentSize = ZSTD_readLE32((const char *)src + 4); + fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */ + return 0; + } + return ERROR(prefix_unknown); + } + + /* ensure there is enough `srcSize` to fully read/decode frame header */ + { + size_t const fhsize = ZSTD_frameHeaderSize(src, srcSize); + if (srcSize < fhsize) + return fhsize; + } + + { + BYTE const fhdByte = ip[4]; + size_t pos = 5; + U32 const dictIDSizeCode = fhdByte & 3; + U32 const checksumFlag = (fhdByte >> 2) & 1; + U32 const singleSegment = (fhdByte >> 5) & 1; + U32 const fcsID = fhdByte >> 6; + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; + U32 windowSize = 0; + U32 dictID = 0; + U64 frameContentSize = 0; + if ((fhdByte & 0x08) != 0) + return ERROR(frameParameter_unsupported); /* reserved bits, which must be zero */ + if (!singleSegment) { + BYTE const wlByte = ip[pos++]; + U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN; + if (windowLog > ZSTD_WINDOWLOG_MAX) + return ERROR(frameParameter_windowTooLarge); /* avoids issue with 1 << windowLog */ + windowSize = (1U << windowLog); + windowSize += (windowSize >> 3) * (wlByte & 7); + } + + switch (dictIDSizeCode) { + default: /* impossible */ + case 0: break; + case 1: + dictID = ip[pos]; + pos++; + break; + case 2: + 
dictID = ZSTD_readLE16(ip + pos); + pos += 2; + break; + case 3: + dictID = ZSTD_readLE32(ip + pos); + pos += 4; + break; + } + switch (fcsID) { + default: /* impossible */ + case 0: + if (singleSegment) + frameContentSize = ip[pos]; + break; + case 1: frameContentSize = ZSTD_readLE16(ip + pos) + 256; break; + case 2: frameContentSize = ZSTD_readLE32(ip + pos); break; + case 3: frameContentSize = ZSTD_readLE64(ip + pos); break; + } + if (!windowSize) + windowSize = (U32)frameContentSize; + if (windowSize > windowSizeMax) + return ERROR(frameParameter_windowTooLarge); + fparamsPtr->frameContentSize = frameContentSize; + fparamsPtr->windowSize = windowSize; + fparamsPtr->dictID = dictID; + fparamsPtr->checksumFlag = checksumFlag; + } + return 0; +} + +/** ZSTD_getFrameContentSize() : +* compatible with legacy mode +* @return : decompressed size of the single frame pointed to be `src` if known, otherwise +* - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined +* - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) +{ + { + ZSTD_frameParams fParams; + if (ZSTD_getFrameParams(&fParams, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (fParams.windowSize == 0) { + /* Either skippable or empty frame, size == 0 either way */ + return 0; + } else if (fParams.frameContentSize != 0) { + return fParams.frameContentSize; + } else { + return ZSTD_CONTENTSIZE_UNKNOWN; + } + } +} + +/** ZSTD_findDecompressedSize() : + * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames + * @return : decompressed size of the frames contained */ +unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize) +{ + { + unsigned long long totalDstSize = 0; + while (srcSize >= ZSTD_frameHeaderSize_prefix) { + const U32 magicNumber = ZSTD_readLE32(src); + + if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t skippableSize; + if (srcSize < ZSTD_skippableHeaderSize) + return ERROR(srcSize_wrong); + skippableSize = ZSTD_readLE32((const BYTE *)src + 4) + ZSTD_skippableHeaderSize; + if (srcSize < skippableSize) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + + { + unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + if (ret >= ZSTD_CONTENTSIZE_ERROR) + return ret; + + /* check for overflow */ + if (totalDstSize + ret < totalDstSize) + return ZSTD_CONTENTSIZE_ERROR; + totalDstSize += ret; + } + { + size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); + if (ZSTD_isError(frameSrcSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; + } + } + + if (srcSize) { + return ZSTD_CONTENTSIZE_ERROR; + } + + return totalDstSize; + } +} + +/** ZSTD_decodeFrameHeader() : +* `headerSize` must be the size 
provided by ZSTD_frameHeaderSize(). +* @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ +static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx *dctx, const void *src, size_t headerSize) +{ + size_t const result = ZSTD_getFrameParams(&(dctx->fParams), src, headerSize); + if (ZSTD_isError(result)) + return result; /* invalid header */ + if (result > 0) + return ERROR(srcSize_wrong); /* headerSize too small */ + if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) + return ERROR(dictionary_wrong); + if (dctx->fParams.checksumFlag) + xxh64_reset(&dctx->xxhState, 0); + return 0; +} + +typedef struct { + blockType_e blockType; + U32 lastBlock; + U32 origSize; +} blockProperties_t; + +/*! ZSTD_getcBlockSize() : +* Provides the size of compressed block from block header `src` */ +size_t ZSTD_getcBlockSize(const void *src, size_t srcSize, blockProperties_t *bpPtr) +{ + if (srcSize < ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); + { + U32 const cBlockHeader = ZSTD_readLE24(src); + U32 const cSize = cBlockHeader >> 3; + bpPtr->lastBlock = cBlockHeader & 1; + bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); + bpPtr->origSize = cSize; /* only useful for RLE */ + if (bpPtr->blockType == bt_rle) + return 1; + if (bpPtr->blockType == bt_reserved) + return ERROR(corruption_detected); + return cSize; + } +} + +static size_t ZSTD_copyRawBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + if (srcSize > dstCapacity) + return ERROR(dstSize_tooSmall); + memcpy(dst, src, srcSize); + return srcSize; +} + +static size_t ZSTD_setRleBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize, size_t regenSize) +{ + if (srcSize != 1) + return ERROR(srcSize_wrong); + if (regenSize > dstCapacity) + return ERROR(dstSize_tooSmall); + memset(dst, *(const BYTE *)src, regenSize); + return regenSize; +} + +/*! 
ZSTD_decodeLiteralsBlock() :
+	@return : nb of bytes read from src (< srcSize ) */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx *dctx, const void *src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
+{
+	if (srcSize < MIN_CBLOCK_SIZE)
+		return ERROR(corruption_detected);
+	/* Bits 0-1 of the first byte select the literals encoding (litEncType);
+	 * bits 2-3 (lhlCode) select the size-header layout within that encoding.
+	 * On success dctx->litPtr / dctx->litSize describe the decoded literals. */
+	{
+		const BYTE *const istart = (const BYTE *)src;
+		symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+
+		switch (litEncType) {
+		case set_repeat:
+			if (dctx->litEntropy == 0) /* litEntropy is set to 1 below, once a Huffman-coded block has been decoded */
+				return ERROR(dictionary_corrupted);
+		/* fall-through */
+		case set_compressed:
+			if (srcSize < 5)
+				return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+			{
+				size_t lhSize, litSize, litCSize; /* header size, regenerated size, compressed size */
+				U32 singleStream = 0;
+				U32 const lhlCode = (istart[0] >> 2) & 3;
+				U32 const lhc = ZSTD_readLE32(istart);
+				switch (lhlCode) {
+				case 0:
+				case 1:
+				default: /* note : default is impossible, since lhlCode into [0..3] */
+					/* 2 - 2 - 10 - 10 */
+					singleStream = !lhlCode; /* only lhlCode 0 selects the single-stream decoder */
+					lhSize = 3;
+					litSize = (lhc >> 4) & 0x3FF;
+					litCSize = (lhc >> 14) & 0x3FF;
+					break;
+				case 2:
+					/* 2 - 2 - 14 - 14 */
+					lhSize = 4;
+					litSize = (lhc >> 4) & 0x3FFF;
+					litCSize = lhc >> 18;
+					break;
+				case 3:
+					/* 2 - 2 - 18 - 18 */
+					lhSize = 5;
+					litSize = (lhc >> 4) & 0x3FFFF;
+					litCSize = (lhc >> 22) + (istart[4] << 10); /* top 8 bits come from the 5th header byte */
+					break;
+				}
+				if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX)
+					return ERROR(corruption_detected);
+				if (litCSize + lhSize > srcSize)
+					return ERROR(corruption_detected);
+				/* set_repeat decodes through the table dctx->HUFptr already points to;
+				 * set_compressed hands dctx->entropy.hufTable and the workspace to the
+				 * *_wksp decoders (presumably they rebuild the table from the stream -
+				 * TODO confirm in the HUF decoder). singleStream picks 1X over 4X. */
+				if (HUF_isError(
+					(litEncType == set_repeat)
+					    ? (singleStream ? HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr)
+							    : HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr))
+					    : (singleStream
+						   ? HUF_decompress1X2_DCtx_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
+										 dctx->entropy.workspace, sizeof(dctx->entropy.workspace))
+						   : HUF_decompress4X_hufOnly_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize,
+										   dctx->entropy.workspace, sizeof(dctx->entropy.workspace)))))
+					return ERROR(corruption_detected);
+				/* literals now live in litBuffer; the WILDCOPY_OVERLENGTH bytes after them are zeroed */
+				dctx->litPtr = dctx->litBuffer;
+				dctx->litSize = litSize;
+				dctx->litEntropy = 1;
+				if (litEncType == set_compressed)
+					dctx->HUFptr = dctx->entropy.hufTable; /* later set_repeat blocks decode through this table */
+				memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+				return litCSize + lhSize;
+			}
+		/* raw literals : litSize bytes stored as-is right after the header */
+		case set_basic: {
+			size_t litSize, lhSize;
+			U32 const lhlCode = ((istart[0]) >> 2) & 3;
+			switch (lhlCode) {
+			case 0:
+			case 2:
+			default: /* note : default is impossible, since lhlCode into [0..3] */
+				lhSize = 1;
+				litSize = istart[0] >> 3;
+				break;
+			case 1:
+				lhSize = 2;
+				litSize = ZSTD_readLE16(istart) >> 4;
+				break;
+			case 3:
+				lhSize = 3;
+				litSize = ZSTD_readLE24(istart) >> 4;
+				break;
+			}
+			/* too close to the end of src : copy into litBuffer instead of referencing src */
+			if (lhSize + litSize + WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
+				if (litSize + lhSize > srcSize)
+					return ERROR(corruption_detected);
+				memcpy(dctx->litBuffer, istart + lhSize, litSize);
+				dctx->litPtr = dctx->litBuffer;
+				dctx->litSize = litSize;
+				memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+				return lhSize + litSize;
+			}
+			/* direct reference into compressed stream */
+			dctx->litPtr = istart + lhSize;
+			dctx->litSize = litSize;
+			return lhSize + litSize;
+		}
+		/* RLE literals : the byte istart[lhSize] repeated litSize times */
+		case set_rle: {
+			U32 const lhlCode = ((istart[0]) >> 2) & 3;
+			size_t litSize, lhSize;
+			switch (lhlCode) {
+			case 0:
+			case 2:
+			default: /* note : default is impossible, since lhlCode into [0..3] */
+				lhSize = 1;
+				litSize = istart[0] >> 3;
+				break;
+			case 1:
+				lhSize = 2;
+				litSize = ZSTD_readLE16(istart) >> 4;
+				break;
+			case 3:
+				lhSize = 3;
+				litSize = ZSTD_readLE24(istart) >> 4;
+				if (srcSize < 4)
+					return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+				break;
+			}
+			if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX)
+				return ERROR(corruption_detected);
+			memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); /* fill also covers the over-length tail */
+			dctx->litPtr = dctx->litBuffer;
+			dctx->litSize = litSize;
+			return lhSize + 1; /* header plus the single repeated byte */
+		}
+		default:
+			return ERROR(corruption_detected); /* impossible */
+		}
+	}
+}
+/* Element type of the static default tables below. The U32 member is named
+ * alignedBy4 - presumably it forces 4-byte alignment so a table can be viewed
+ * as FSE_DTable (see the cast in ZSTD_buildSeqTable()) - TODO confirm. */
+typedef union {
+	FSE_decode_t realData;
+	U32 alignedBy4;
+} FSE_decode_t4;
+/* Default literal-length table : ZSTD_decodeSeqHeaders() passes it to
+ * ZSTD_buildSeqTable(), which selects it when the encoding type is set_basic. */
+static const FSE_decode_t4 LL_defaultDTable[(1 << LL_DEFAULTNORMLOG) + 1] = {
+	{{LL_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */
+	{{0, 0, 4}},		 /* 0 : base, symbol, bits */
+	{{16, 0, 4}},
+	{{32, 1, 5}},
+	{{0, 3, 5}},
+	{{0, 4, 5}},
+	{{0, 6, 5}},
+	{{0, 7, 5}},
+	{{0, 9, 5}},
+	{{0, 10, 5}},
+	{{0, 12, 5}},
+	{{0, 14, 6}},
+	{{0, 16, 5}},
+	{{0, 18, 5}},
+	{{0, 19, 5}},
+	{{0, 21, 5}},
+	{{0, 22, 5}},
+	{{0, 24, 5}},
+	{{32, 25, 5}},
+	{{0, 26, 5}},
+	{{0, 27, 6}},
+	{{0, 29, 6}},
+	{{0, 31, 6}},
+	{{32, 0, 4}},
+	{{0, 1, 4}},
+	{{0, 2, 5}},
+	{{32, 4, 5}},
+	{{0, 5, 5}},
+	{{32, 7, 5}},
+	{{0, 8, 5}},
+	{{32, 10, 5}},
+	{{0, 11, 5}},
+	{{0, 13, 6}},
+	{{32, 16, 5}},
+	{{0, 17, 5}},
+	{{32, 19, 5}},
+	{{0, 20, 5}},
+	{{32, 22, 5}},
+	{{0, 23, 5}},
+	{{0, 25, 4}},
+	{{16, 25, 4}},
+	{{32, 26, 5}},
+	{{0, 28, 6}},
+	{{0, 30, 6}},
+	{{48, 0, 4}},
+	{{16, 1, 4}},
+	{{32, 2, 5}},
+	{{32, 3, 5}},
+	{{32, 5, 5}},
+	{{32, 6, 5}},
+	{{32, 8, 5}},
+	{{32, 9, 5}},
+	{{32, 11, 5}},
+	{{32, 12, 5}},
+	{{0, 15, 6}},
+	{{32, 17, 5}},
+	{{32, 18, 5}},
+	{{32, 20, 5}},
+	{{32, 21, 5}},
+	{{32, 23, 5}},
+	{{32, 24, 5}},
+	{{0, 35, 6}},
+	{{0, 34, 6}},
+	{{0, 33, 6}},
+	{{0, 32, 6}},
+}; /* LL_defaultDTable */
+/* Default match-length table, used the same way as LL_defaultDTable. */
+static const FSE_decode_t4 ML_defaultDTable[(1 << ML_DEFAULTNORMLOG) + 1] = {
+	{{ML_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */
+	{{0, 0, 6}},		 /* 0 : base, symbol, bits */
+	{{0, 1, 4}},
+	{{32, 2, 5}},
+	{{0, 3, 5}},
+	{{0, 5, 5}},
+	{{0, 6, 5}},
+	{{0, 8, 5}},
+	{{0, 10, 6}},
+	{{0, 13, 6}},
+	{{0, 16, 6}},
+	{{0, 19, 6}},
+	{{0, 22, 6}},
+	{{0, 25, 6}},
+	{{0, 28, 6}},
+	{{0, 31, 6}},
+	{{0, 33, 6}},
+	{{0, 35, 6}},
+	{{0, 37, 6}},
+	{{0, 39, 6}},
+	{{0, 41, 6}},
+	{{0, 43, 6}},
+	{{0, 45, 6}},
+	{{16, 1, 4}},
+	{{0, 2, 4}},
+	{{32, 3, 5}},
+	{{0, 4, 5}},
+	{{32, 6, 5}},
+	{{0, 7, 5}},
+	{{0, 9, 6}},
+	{{0, 12, 6}},
+	{{0, 15, 6}},
+	{{0, 18, 6}},
+	{{0, 21, 6}},
+	{{0, 24, 6}},
+	{{0, 27, 6}},
+	{{0, 30, 6}},
+	{{0, 32, 6}},
+	{{0, 34, 6}},
+	{{0, 36, 6}},
+	{{0, 38, 6}},
+	{{0, 40, 6}},
+	{{0, 42, 6}},
+	{{0, 44, 6}},
+	{{32, 1, 4}},
+	{{48, 1, 4}},
+	{{16, 2, 4}},
+	{{32, 4, 5}},
+	{{32, 5, 5}},
+	{{32, 7, 5}},
+	{{32, 8, 5}},
+	{{0, 11, 6}},
+	{{0, 14, 6}},
+	{{0, 17, 6}},
+	{{0, 20, 6}},
+	{{0, 23, 6}},
+	{{0, 26, 6}},
+	{{0, 29, 6}},
+	{{0, 52, 6}},
+	{{0, 51, 6}},
+	{{0, 50, 6}},
+	{{0, 49, 6}},
+	{{0, 48, 6}},
+	{{0, 47, 6}},
+	{{0, 46, 6}},
+}; /* ML_defaultDTable */
+/* Default offset-code table, used the same way as LL_defaultDTable. */
+static const FSE_decode_t4 OF_defaultDTable[(1 << OF_DEFAULTNORMLOG) + 1] = {
+	{{OF_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */
+	{{0, 0, 5}},		 /* 0 : base, symbol, bits */
+	{{0, 6, 4}},
+	{{0, 9, 5}},
+	{{0, 15, 5}},
+	{{0, 21, 5}},
+	{{0, 3, 5}},
+	{{0, 7, 4}},
+	{{0, 12, 5}},
+	{{0, 18, 5}},
+	{{0, 23, 5}},
+	{{0, 5, 5}},
+	{{0, 8, 4}},
+	{{0, 14, 5}},
+	{{0, 20, 5}},
+	{{0, 2, 5}},
+	{{16, 7, 4}},
+	{{0, 11, 5}},
+	{{0, 17, 5}},
+	{{0, 22, 5}},
+	{{0, 4, 5}},
+	{{16, 8, 4}},
+	{{0, 13, 5}},
+	{{0, 19, 5}},
+	{{0, 1, 5}},
+	{{16, 6, 4}},
+	{{0, 10, 5}},
+	{{0, 16, 5}},
+	{{0, 28, 5}},
+	{{0, 27, 5}},
+	{{0, 26, 5}},
+	{{0, 25, 5}},
+	{{0, 24, 5}},
+}; /* OF_defaultDTable */
+
+/*!
ZSTD_buildSeqTable() :
+	@return : nb bytes read from src,
+			  or an error code if it fails, testable with ZSTD_isError()
+*/
+static size_t ZSTD_buildSeqTable(FSE_DTable *DTableSpace, const FSE_DTable **DTablePtr, symbolEncodingType_e type, U32 max, U32 maxLog, const void *src,
+				 size_t srcSize, const FSE_decode_t4 *defaultTable, U32 flagRepeatTable, void *workspace, size_t workspaceSize)
+{
+	const void *const tmpPtr = defaultTable; /* bypass strict aliasing */
+	switch (type) {
+	case set_rle: /* a single symbol, stored in one byte at src */
+		if (!srcSize)
+			return ERROR(srcSize_wrong);
+		if ((*(const BYTE *)src) > max)
+			return ERROR(corruption_detected);
+		FSE_buildDTable_rle(DTableSpace, *(const BYTE *)src);
+		*DTablePtr = DTableSpace;
+		return 1;
+	case set_basic: *DTablePtr = (const FSE_DTable *)tmpPtr; return 0; /* static default table, nothing read from src */
+	case set_repeat: /* *DTablePtr is left as it is; allowed only when the caller passes a non-zero flagRepeatTable */
+		if (!flagRepeatTable)
+			return ERROR(corruption_detected);
+		return 0;
+	default: /* impossible */
+	case set_compressed: { /* counts are read from src into norm[], then a table is built in DTableSpace */
+		U32 tableLog;
+		S16 *norm = (S16 *)workspace; /* norm[] occupies the front of the workspace */
+		size_t const spaceUsed32 = ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2; /* size of norm[] in U32 units */
+
+		if ((spaceUsed32 << 2) > workspaceSize)
+			return ERROR(GENERIC);
+		workspace = (U32 *)workspace + spaceUsed32; /* remainder of the workspace goes to the table builder */
+		workspaceSize -= (spaceUsed32 << 2);
+		{
+			size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+			if (FSE_isError(headerSize))
+				return ERROR(corruption_detected);
+			if (tableLog > maxLog)
+				return ERROR(corruption_detected);
+			FSE_buildDTable_wksp(DTableSpace, norm, max, tableLog, workspace, workspaceSize); /* NOTE(review): return value is not checked */
+			*DTablePtr = DTableSpace;
+			return headerSize;
+		}
+	}
+	}
+}
+/* ZSTD_decodeSeqHeaders() :
+ * Reads the sequence count into *nbSeqPtr, then the byte holding the three
+ * symbol encoding types, and sets up the LL / OF / ML decoding tables in dctx.
+ * @return : nb of bytes read from src, or an error code */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx *dctx, int *nbSeqPtr, const void *src, size_t srcSize)
+{
+	const BYTE *const istart = (const BYTE *const)src;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *ip = istart;
+
+	/* check */
+	if (srcSize < MIN_SEQUENCES_SIZE)
+		return ERROR(srcSize_wrong);
+
+	/* SeqHead */
+	{
+		int nbSeq = *ip++;
+		if (!nbSeq) { /* no sequences : only the count byte is consumed */
+			*nbSeqPtr = 0;
+			return 1;
+		}
+		if (nbSeq > 0x7F) {
+			if (nbSeq == 0xFF) { /* 3-byte form : 16-bit value plus LONGNBSEQ */
+				if (ip + 2 > iend)
+					return ERROR(srcSize_wrong);
+				nbSeq = ZSTD_readLE16(ip) + LONGNBSEQ, ip += 2;
+			} else { /* 2-byte form : low 7 bits of the first byte, then one more byte */
+				if (ip >= iend)
+					return ERROR(srcSize_wrong);
+				nbSeq = ((nbSeq - 0x80) << 8) + *ip++;
+			}
+		}
+		*nbSeqPtr = nbSeq;
+	}
+
+	/* FSE table descriptors */
+	if (ip + 4 > iend)
+		return ERROR(srcSize_wrong); /* minimum possible size */
+	{
+		symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+		symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+		symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+		ip++;
+
+		/* Build DTables */
+		{
+			size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, ip, iend - ip,
+								  LL_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
+			if (ZSTD_isError(llhSize))
+				return ERROR(corruption_detected);
+			ip += llhSize;
+		}
+		{
+			size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, ip, iend - ip,
+								  OF_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
+			if (ZSTD_isError(ofhSize))
+				return ERROR(corruption_detected);
+			ip += ofhSize;
+		}
+		{
+			size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, ip, iend - ip,
+								  ML_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace));
+			if (ZSTD_isError(mlhSize))
+				return ERROR(corruption_detected);
+			ip += mlhSize;
+		}
+	}
+
+	return ip - istart;
+}
+/* One decoded sequence : copy litLength literals, then matchLength bytes from `offset` bytes back. */
+typedef struct {
+	size_t litLength;
+	size_t matchLength;
+	size_t offset;
+	const BYTE *match; /* match source address; computed only by the Long decoder, NULL otherwise */
+} seq_t;
+/* Running state of the sequence decoder for one block. */
+typedef struct {
+	BIT_DStream_t DStream;
+	FSE_DState_t stateLL;
+	FSE_DState_t stateOffb;
+	FSE_DState_t stateML;
+	size_t prevOffset[ZSTD_REP_NUM]; /* recent offsets, most recent first */
+	const BYTE *base; /* base, pos and gotoDict are set and read only on the Long path */
+	size_t pos; /* output position relative to base, advanced per decoded sequence */
+	uPtrDiff gotoDict; /* dictEnd - base : added to a match address that falls before base */
+} seqState_t;
+/* ZSTD_execSequenceLast7() :
+ * Byte-careful version of ZSTD_execSequence() for a sequence whose literals end
+ * past oend - WILDCOPY_OVERLENGTH; anything else is rejected with GENERIC.
+ * @return : nb of bytes written (litLength + matchLength), or an error code */
+FORCE_NOINLINE
+size_t ZSTD_execSequenceLast7(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base,
+			      const BYTE *const vBase, const BYTE *const dictEnd)
+{
+	BYTE *const oLitEnd = op + sequence.litLength;
+	size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+	BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+	BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH;
+	const BYTE *const iLitEnd = *litPtr + sequence.litLength;
+	const BYTE *match = oLitEnd - sequence.offset; /* recomputed from offset; sequence.match is not used here */
+
+	/* check */
+	if (oMatchEnd > oend)
+		return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+	if (iLitEnd > litLimit)
+		return ERROR(corruption_detected); /* over-read beyond lit buffer */
+	if (oLitEnd <= oend_w)
+		return ERROR(GENERIC); /* Precondition */
+
+	/* copy literals */
+	if (op < oend_w) { /* wildcopy up to oend_w, the rest byte by byte */
+		ZSTD_wildcopy(op, *litPtr, oend_w - op);
+		*litPtr += oend_w - op;
+		op = oend_w;
+	}
+	while (op < oLitEnd)
+		*op++ = *(*litPtr)++;
+
+	/* copy Match */
+	if (sequence.offset > (size_t)(oLitEnd - base)) {
+		/* offset beyond prefix */
+		if (sequence.offset > (size_t)(oLitEnd - vBase))
+			return ERROR(corruption_detected);
+		match = dictEnd - (base - match); /* same distance before dictEnd as match was before base */
+		if (match + sequence.matchLength <= dictEnd) {
+			memmove(oLitEnd, match, sequence.matchLength);
+			return sequenceLength;
+		}
+		/* span extDict & currPrefixSegment */
+		{
+			size_t const length1 = dictEnd - match;
+			memmove(oLitEnd, match, length1);
+			op = oLitEnd + length1;
+			sequence.matchLength -= length1;
+			match = base;
+		}
+	}
+	while (op < oMatchEnd)
+		*op++ = *match++;
+	return sequenceLength;
+}
+/* ZSTD_decodeSequence() :
+ * Decodes the next sequence from the three FSE states and the bitstream,
+ * updating the recent-offset history in seqState->prevOffset.
+ * seq.match is always NULL on this path. */
+static seq_t ZSTD_decodeSequence(seqState_t *seqState)
+{
+	seq_t seq;
+
+	U32 const llCode = FSE_peekSymbol(&seqState->stateLL);
+	U32 const mlCode = FSE_peekSymbol(&seqState->stateML);
+	U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */
+
+	U32 const llBits = LL_bits[llCode];
+	U32 const mlBits = ML_bits[mlCode];
+	U32 const ofBits = ofCode; /* the offset code is itself the number of extra bits to read */
+	U32 const totalBits = llBits + mlBits + ofBits;
+	/* per-code baselines : the extra bits read from the stream are added to these */
+	static const U32 LL_base[MaxLL + 1] = {0,  1,  2,  3,  4,  5,  6,  7,  8,    9,     10,    11,    12,    13,     14,     15,     16,     18,
+					       20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000};
+
+	static const U32 ML_base[MaxML + 1] = {3,  4,  5,  6,  7,  8,  9,  10,   11,    12,    13,    14,    15,     16,     17,     18,     19,     20,
+					       21, 22, 23, 24, 25, 26, 27, 28,   29,    30,    31,    32,    33,     34,     35,     37,     39,     41,
+					       43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003};
+
+	static const U32 OF_base[MaxOff + 1] = {0,       1,	1,	5,	0xD,      0x1D,      0x3D,      0x7D,      0xFD,     0x1FD,
+						0x3FD,   0x7FD,    0xFFD,    0x1FFD,   0x3FFD,   0x7FFD,    0xFFFD,    0x1FFFD,   0x3FFFD,  0x7FFFD,
+						0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD};
+
+	/* sequence */
+	{
+		size_t offset;
+		if (!ofCode)
+			offset = 0;
+		else {
+			offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+			if (ZSTD_32bits())
+				BIT_reloadDStream(&seqState->DStream);
+		}
+
+		if (ofCode <= 1) { /* here offset is 0..3 and selects from the recent-offset history instead of being a distance */
+			offset += (llCode == 0); /* with no literals the selection is shifted by one */
+			if (offset) {
+				size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+				temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+				if (offset != 1)
+					seqState->prevOffset[2] = seqState->prevOffset[1];
+				seqState->prevOffset[1] = seqState->prevOffset[0];
+				seqState->prevOffset[0] = offset = temp;
+			} else {
+				offset = seqState->prevOffset[0]; /* most recent offset reused, history unchanged */
+			}
+		} else { /* a new explicit offset : push it onto the history */
+			seqState->prevOffset[2] = seqState->prevOffset[1];
+			seqState->prevOffset[1] = seqState->prevOffset[0];
+			seqState->prevOffset[0] = offset;
+		}
+		seq.offset = offset;
+	}
+
+	seq.matchLength = ML_base[mlCode] + ((mlCode > 31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
+	if (ZSTD_32bits() && (mlBits + llBits > 24))
+		BIT_reloadDStream(&seqState->DStream);
+
+	seq.litLength = LL_base[llCode] + ((llCode > 15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
+	if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog)))
+		BIT_reloadDStream(&seqState->DStream);
+
+	/* ANS state update */
+	FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
+	FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+	if (ZSTD_32bits())
+		BIT_reloadDStream(&seqState->DStream);		   /* <= 18 bits */
+	FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+
+	seq.match = NULL;
+
+	return seq;
+}
+/* ZSTD_execSequence() :
+ * Writes one sequence at `op` : litLength literals taken from *litPtr, then
+ * matchLength bytes from `offset` bytes back (possibly reaching into the
+ * segment that ends at dictEnd). *litPtr is advanced past the literals.
+ * @return : nb of bytes written (litLength + matchLength), or an error code */
+FORCE_INLINE
+size_t ZSTD_execSequence(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base,
+			 const BYTE *const vBase, const BYTE *const dictEnd)
+{
+	BYTE *const oLitEnd = op + sequence.litLength;
+	size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+	BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+	BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH;
+	const BYTE *const iLitEnd = *litPtr + sequence.litLength;
+	const BYTE *match = oLitEnd - sequence.offset;
+
+	/* check */
+	if (oMatchEnd > oend)
+		return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+	if (iLitEnd > litLimit)
+		return ERROR(corruption_detected); /* over-read beyond lit buffer */
+	if (oLitEnd > oend_w)
+		return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+
+	/* copy Literals */
+	ZSTD_copy8(op, *litPtr);
+	if (sequence.litLength > 8)
+		ZSTD_wildcopy(op + 8, (*litPtr) + 8,
+			      sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+	op = oLitEnd;
+	*litPtr = iLitEnd; /* update for next sequence */
+
+	/* copy Match */
+	if (sequence.offset > (size_t)(oLitEnd - base)) {
+		/* offset beyond prefix */
+		if (sequence.offset > (size_t)(oLitEnd - vBase))
+			return ERROR(corruption_detected);
+		match = dictEnd + (match - base); /* same distance before dictEnd as match was before base */
+		if (match + sequence.matchLength <= dictEnd) {
+			memmove(oLitEnd, match, sequence.matchLength);
+			return sequenceLength;
+		}
+		/* span extDict & currPrefixSegment */
+		{
+			size_t const length1 = dictEnd - match;
+			memmove(oLitEnd, match, length1);
+			op = oLitEnd + length1;
+			sequence.matchLength -= length1;
+			match = base;
+			if (op > oend_w || sequence.matchLength < MINMATCH) { /* the fast path below cannot be used : finish byte by byte */
+				U32 i;
+				for (i = 0; i < sequence.matchLength; ++i)
+					op[i] = match[i];
+				return sequenceLength;
+			}
+		}
+	}
+	/* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+
+	/* match within prefix */
+	if (sequence.offset < 8) {
+		/* close range match, overlap */
+		static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+		static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */
+		int const sub2 = dec64table[sequence.offset];
+		op[0] = match[0]; /* first four bytes go one at a time because source and destination overlap */
+		op[1] = match[1];
+		op[2] = match[2];
+		op[3] = match[3];
+		match += dec32table[sequence.offset];
+		ZSTD_copy4(op + 4, match);
+		match -= sub2;
+	} else {
+		ZSTD_copy8(op, match);
+	}
+	op += 8;
+	match += 8;
+
+	if (oMatchEnd > oend - (16 - MINMATCH)) { /* close to oend : wildcopy only up to oend_w, then byte by byte */
+		if (op < oend_w) {
+			ZSTD_wildcopy(op, match, oend_w - op);
+			match += oend_w - op;
+			op = oend_w;
+		}
+		while (op < oMatchEnd)
+			*op++ = *match++;
+	} else {
+		ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */
+	}
+	return sequenceLength;
+}
+/* ZSTD_decompressSequences() :
+ * Decodes the sequences section at seqStart and regenerates the block into dst,
+ * consuming the literals described by dctx->litPtr / dctx->litSize.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressSequences(ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, const void *seqStart, size_t seqSize)
+{
+	const BYTE *ip = (const BYTE *)seqStart;
+	const BYTE *const iend = ip + seqSize;
+	BYTE *const ostart = (BYTE * const)dst;
+	BYTE *const oend = ostart + maxDstSize;
+	BYTE *op = ostart;
+	const BYTE *litPtr = dctx->litPtr;
+	const BYTE *const litEnd = litPtr + dctx->litSize;
+	const BYTE *const base = (const BYTE *)(dctx->base);
+	const BYTE *const vBase = (const BYTE *)(dctx->vBase);
+	const BYTE *const dictEnd = (const BYTE *)(dctx->dictEnd);
+	int nbSeq;
+
+	/* Build Decoding Tables */
+	{
+		size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize);
+		if (ZSTD_isError(seqHSize))
+			return seqHSize;
+		ip += seqHSize;
+	}
+
+	/* Regen sequences */
+	if (nbSeq) {
+		seqState_t seqState;
+		dctx->fseEntropy = 1; /* this value is what ZSTD_decodeSeqHeaders() passes as flagRepeatTable */
+		{
+			U32 i;
+			for (i = 0; i < ZSTD_REP_NUM; i++)
+				seqState.prevOffset[i] = dctx->entropy.rep[i];
+		}
+		CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend - ip), corruption_detected);
+		FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+		FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+		FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+		for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq;) {
+			nbSeq--;
+			{
+				seq_t const sequence = ZSTD_decodeSequence(&seqState);
+				size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+				if (ZSTD_isError(oneSeqSize))
+					return oneSeqSize;
+				op += oneSeqSize;
+			}
+		}
+
+		/* check if reached exact end */
+		if (nbSeq)
+			return ERROR(corruption_detected);
+		/* save reps for next block */
+		{
+			U32 i;
+			for (i = 0; i < ZSTD_REP_NUM; i++)
+				dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]);
+		}
+	}
+
+	/* last literal segment */
+	{
+		size_t const lastLLSize = litEnd - litPtr;
+		if (lastLLSize > (size_t)(oend - op))
+			return ERROR(dstSize_tooSmall);
+		memcpy(op, litPtr, lastLLSize);
+		op += lastLLSize;
+	}
+
+	return op - ostart;
+}
+/* ZSTD_decodeSequenceLong_generic() :
+ * Same decoding as ZSTD_decodeSequence(), plus : the offset may be read in two
+ * parts when longOffsets is set, and seq.match is computed from seqState->base,
+ * seqState->pos and seqState->gotoDict so that the caller can prefetch it. */
+FORCE_INLINE seq_t ZSTD_decodeSequenceLong_generic(seqState_t *seqState, int const longOffsets)
+{
+	seq_t seq;
+
+	U32 const llCode = FSE_peekSymbol(&seqState->stateLL);
+	U32 const mlCode = FSE_peekSymbol(&seqState->stateML);
+	U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */
+
+	U32 const llBits = LL_bits[llCode];
+	U32 const mlBits = ML_bits[mlCode];
+	U32 const ofBits = ofCode; /* the offset code is itself the number of extra bits to read */
+	U32 const totalBits = llBits + mlBits + ofBits;
+	/* per-code baselines : the extra bits read from the stream are added to these */
+	static const U32 LL_base[MaxLL + 1] = {0,  1,  2,  3,  4,  5,  6,  7,  8,    9,     10,    11,    12,    13,     14,     15,     16,     18,
+					       20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000};
+
+	static const U32 ML_base[MaxML + 1] = {3,  4,  5,  6,  7,  8,  9,  10,   11,    12,    13,    14,    15,     16,     17,     18,     19,     20,
+					       21, 22, 23, 24, 25, 26, 27, 28,   29,    30,    31,    32,    33,     34,     35,     37,     39,     41,
+					       43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003};
+
+	static const U32 OF_base[MaxOff + 1] = {0,       1,	1,	5,	0xD,      0x1D,      0x3D,      0x7D,      0xFD,     0x1FD,
+						0x3FD,   0x7FD,    0xFFD,    0x1FFD,   0x3FFD,   0x7FFD,    0xFFFD,    0x1FFFD,   0x3FFFD,  0x7FFFD,
+						0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD};
+
+	/* sequence */
+	{
+		size_t offset;
+		if (!ofCode)
+			offset = 0;
+		else {
+			if (longOffsets) {
+				int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN); /* bits beyond STREAM_ACCUMULATOR_MIN, read in a second step */
+				offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+				if (ZSTD_32bits() || extraBits)
+					BIT_reloadDStream(&seqState->DStream);
+				if (extraBits)
+					offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+			} else {
+				offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+				if (ZSTD_32bits())
+					BIT_reloadDStream(&seqState->DStream);
+			}
+		}
+
+		if (ofCode <= 1) { /* here offset is 0..3 and selects from the recent-offset history instead of being a distance */
+			offset += (llCode == 0); /* with no literals the selection is shifted by one */
+			if (offset) {
+				size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+				temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+				if (offset != 1)
+					seqState->prevOffset[2] = seqState->prevOffset[1];
+				seqState->prevOffset[1] = seqState->prevOffset[0];
+				seqState->prevOffset[0] = offset = temp;
+			} else {
+				offset = seqState->prevOffset[0]; /* most recent offset reused, history unchanged */
+			}
+		} else { /* a new explicit offset : push it onto the history */
+			seqState->prevOffset[2] = seqState->prevOffset[1];
+			seqState->prevOffset[1] = seqState->prevOffset[0];
+			seqState->prevOffset[0] = offset;
+		}
+		seq.offset = offset;
+	}
+
+	seq.matchLength = ML_base[mlCode] + ((mlCode > 31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
+	if (ZSTD_32bits() && (mlBits + llBits > 24))
+		BIT_reloadDStream(&seqState->DStream);
+
+	seq.litLength = LL_base[llCode] + ((llCode > 15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
+	if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog)))
+		BIT_reloadDStream(&seqState->DStream);
+	/* resolve the match address now, so the caller can prefetch it before the sequence is executed */
+	{
+		size_t const pos = seqState->pos + seq.litLength;
+		seq.match = seqState->base + pos - seq.offset; /* single memory segment */
+		if (seq.offset > pos)
+			seq.match += seqState->gotoDict; /* separate memory segment */
+		seqState->pos = pos + seq.matchLength;
+	}
+
+	/* ANS state update */
+	FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
+	FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+	if (ZSTD_32bits())
+		BIT_reloadDStream(&seqState->DStream);		   /* <= 18 bits */
+	FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+
+	return seq;
+}
+/* Dispatches to the generic decoder with a constant longOffsets argument;
+ * the two-part offset read is used only when ZSTD_highbit32(windowSize) exceeds STREAM_ACCUMULATOR_MIN. */
+static seq_t ZSTD_decodeSequenceLong(seqState_t *seqState, unsigned const windowSize)
+{
+	if (ZSTD_highbit32(windowSize) > STREAM_ACCUMULATOR_MIN) {
+		return ZSTD_decodeSequenceLong_generic(seqState, 1);
+	} else {
+		return ZSTD_decodeSequenceLong_generic(seqState, 0);
+	}
+}
+/* ZSTD_execSequenceLong() :
+ * Same as ZSTD_execSequence(), except that the match address comes from
+ * sequence.match (already resolved by the Long decoder) instead of being
+ * derived from the offset.
+ * @return : nb of bytes written (litLength + matchLength), or an error code */
+FORCE_INLINE
+size_t ZSTD_execSequenceLong(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base,
+			     const BYTE *const vBase, const BYTE *const dictEnd)
+{
+	BYTE *const oLitEnd = op + sequence.litLength;
+	size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+	BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+	BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH;
+	const BYTE *const iLitEnd = *litPtr + sequence.litLength;
+	const BYTE *match = sequence.match;
+
+	/* check */
+	if (oMatchEnd > oend)
+		return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+	if (iLitEnd > litLimit)
+		return ERROR(corruption_detected); /* over-read beyond lit buffer */
+	if (oLitEnd > oend_w)
+		return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+
+	/* copy Literals */
+	ZSTD_copy8(op, *litPtr);
+	if (sequence.litLength > 8)
+		ZSTD_wildcopy(op + 8, (*litPtr) + 8,
+			      sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+	op = oLitEnd;
+	*litPtr = iLitEnd; /* update for next sequence */
+
+	/* copy Match */
+	if (sequence.offset > (size_t)(oLitEnd - base)) {
+		/* offset beyond prefix */
+		if (sequence.offset > (size_t)(oLitEnd - vBase))
+			return ERROR(corruption_detected);
+		if (match + sequence.matchLength <= dictEnd) {
+			memmove(oLitEnd, match, sequence.matchLength);
+			return sequenceLength;
+		}
+		/* span extDict & currPrefixSegment */
+		{
+			size_t const length1 = dictEnd - match;
+			memmove(oLitEnd, match, length1);
+			op = oLitEnd + length1;
+			sequence.matchLength -= length1;
+			match = base;
+			if (op > oend_w || sequence.matchLength < MINMATCH) { /* the fast path below cannot be used : finish byte by byte */
+				U32 i;
+				for (i = 0; i < sequence.matchLength; ++i)
+					op[i] = match[i];
+				return sequenceLength;
+			}
+		}
+	}
+	/* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+
+	/* match within prefix */
+	if (sequence.offset < 8) {
+		/* close range match, overlap */
+		static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+		static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */
+		int const sub2 = dec64table[sequence.offset];
+		op[0] = match[0]; /* first four bytes go one at a time because source and destination overlap */
+		op[1] = match[1];
+		op[2] = match[2];
+		op[3] = match[3];
+		match += dec32table[sequence.offset];
+		ZSTD_copy4(op + 4, match);
+		match -= sub2;
+	} else {
+		ZSTD_copy8(op, match);
+	}
+	op += 8;
+	match += 8;
+
+	if (oMatchEnd > oend - (16 - MINMATCH)) { /* close to oend : wildcopy only up to oend_w, then byte by byte */
+		if (op < oend_w) {
+			ZSTD_wildcopy(op, match, oend_w - op);
+			match += oend_w - op;
+			op = oend_w;
+		}
+		while (op < oMatchEnd)
+			*op++ = *match++;
+	} else {
+		ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */
+	}
+	return sequenceLength;
+}
+/* ZSTD_decompressSequencesLong() :
+ * Variant of ZSTD_decompressSequences() that decodes ADVANCED_SEQS sequences
+ * ahead of the one being executed. Decoded sequences wait in a ring of
+ * STORED_SEQS entries placed in dctx->entropy.workspace, and the match address
+ * of each newly decoded sequence is prefetched.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, const void *seqStart, size_t seqSize)
+{
+	const BYTE *ip = (const BYTE *)seqStart;
+	const BYTE *const iend = ip + seqSize;
+	BYTE *const ostart = (BYTE * const)dst;
+	BYTE *const oend = ostart + maxDstSize;
+	BYTE *op = ostart;
+	const BYTE *litPtr = dctx->litPtr;
+	const BYTE *const litEnd = litPtr + dctx->litSize;
+	const BYTE *const base = (const BYTE *)(dctx->base);
+	const BYTE *const vBase = (const BYTE *)(dctx->vBase);
+	const BYTE *const dictEnd = (const BYTE *)(dctx->dictEnd);
+	unsigned const windowSize = dctx->fParams.windowSize;
+	int nbSeq;
+
+	/* Build Decoding Tables */
+	{
+		size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize);
+		if (ZSTD_isError(seqHSize))
+			return seqHSize;
+		ip += seqHSize;
+	}
+
+	/* Regen sequences */
+	if (nbSeq) {
+#define STORED_SEQS 4
+#define STOSEQ_MASK (STORED_SEQS - 1)
+#define ADVANCED_SEQS 4
+		seq_t *sequences = (seq_t *)dctx->entropy.workspace; /* ring buffer, indexed with & STOSEQ_MASK */
+		int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+		seqState_t seqState;
+		int seqNb;
+		ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.workspace) >= sizeof(seq_t) * STORED_SEQS);
+		dctx->fseEntropy = 1; /* this value is what ZSTD_decodeSeqHeaders() passes as flagRepeatTable */
+		{
+			U32 i;
+			for (i = 0; i < ZSTD_REP_NUM; i++)
+				seqState.prevOffset[i] = dctx->entropy.rep[i];
+		}
+		seqState.base = base;
+		seqState.pos = (size_t)(op - base);
+		seqState.gotoDict = (uPtrDiff)dictEnd - (uPtrDiff)base; /* cast to avoid undefined behaviour */
+		CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend - ip), corruption_detected);
+		FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+		FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+		FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+		/* prepare in advance */
+		for (seqNb = 0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNb < seqAdvance; seqNb++) {
+			sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize);
+		}
+		if (seqNb < seqAdvance)
+			return ERROR(corruption_detected);
+
+		/* decode and decompress */
+		for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNb < nbSeq; seqNb++) {
+			seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize);
+			size_t const oneSeqSize =
+			    ZSTD_execSequenceLong(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd);
+			if (ZSTD_isError(oneSeqSize))
+				return oneSeqSize;
+			ZSTD_PREFETCH(sequence.match);
+			sequences[seqNb & STOSEQ_MASK] = sequence; /* takes the ring slot of the sequence that was just executed */
+			op += oneSeqSize;
+		}
+		if (seqNb < nbSeq)
+			return ERROR(corruption_detected);
+
+		/* finish queue */
+		seqNb -= seqAdvance;
+		for (; seqNb < nbSeq; seqNb++) {
+			size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd);
+			if (ZSTD_isError(oneSeqSize))
+				return oneSeqSize;
+			op += oneSeqSize;
+		}
+
+		/* save reps for next block */
+		{
+			U32 i;
+			for (i = 0; i < ZSTD_REP_NUM; i++)
+				dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]);
+		}
+	}
+
+	/* last literal segment */
+	{
+		size_t const lastLLSize = litEnd - litPtr;
+		if (lastLLSize > (size_t)(oend - op))
+			return ERROR(dstSize_tooSmall);
+		memcpy(op, litPtr, lastLLSize);
+		op += lastLLSize;
+	}
+
+	return op - ostart;
+}
+/* ZSTD_decompressBlock_internal() :
+ * Decodes one compressed block : the literals section first, then the sequences
+ * section. The prefetching decoder is chosen only when size_t is wider than
+ * 4 bytes and the frame's window exceeds 1 << 23.
+ * @return : nb of bytes written into dst, or an error code */
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{ /* blockType == blockCompressed */
+	const BYTE *ip = (const BYTE *)src;
+
+	if (srcSize >= ZSTD_BLOCKSIZE_ABSOLUTEMAX)
+		return ERROR(srcSize_wrong);
+
+	/* Decode literals section */
+	{
+		size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+		if (ZSTD_isError(litCSize))
+			return litCSize;
+		ip += litCSize;
+		srcSize -= litCSize;
+	}
+	if (sizeof(size_t) > 4) /* do not enable prefetching on 32-bits x86, as it's performance detrimental */
+				/* likely because of register pressure */
+				/* if that's the correct cause, then 32-bits ARM should be affected differently */
+				/* it would be good to test this on ARM real hardware, to see if prefetch version improves speed */
+		if (dctx->fParams.windowSize > (1 << 23))
+			return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize);
+	return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
+}
+/* ZSTD_checkContinuity() :
+ * When dst does not start where the previous output ended, the previous output
+ * becomes the segment ending at dctx->dictEnd, and base / vBase are re-anchored
+ * on dst so that (dst - vBase) equals the old (previousDstEnd - base). */
+static void ZSTD_checkContinuity(ZSTD_DCtx *dctx, const void *dst)
+{
+	if (dst != dctx->previousDstEnd) { /* not contiguous */
+		dctx->dictEnd = dctx->previousDstEnd;
+		dctx->vBase = (const char *)dst - ((const char *)(dctx->previousDstEnd) - (const char *)(dctx->base));
+		dctx->base = dst;
+		dctx->previousDstEnd = dst;
+	}
+}
+/* ZSTD_decompressBlock() :
+ * Decodes a single compressed block into dst, after updating the history
+ * pointers for a possibly non-contiguous dst.
+ * @return : ZSTD_decompressBlock_internal()'s result (bytes written or error code) */
+size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+	size_t dSize;
+	ZSTD_checkContinuity(dctx, dst);
+	dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+	dctx->previousDstEnd = (char *)dst + dSize; /* NOTE(review): dSize is added even when it is an error code */
+	return dSize;
+}
+
+/** ZSTD_insertBlock() :
+	insert `src` block into `dctx` history. Useful to track uncompressed blocks.
*/ +size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, size_t blockSize) +{ + ZSTD_checkContinuity(dctx, blockStart); + dctx->previousDstEnd = (const char *)blockStart + blockSize; + return blockSize; +} + +size_t ZSTD_generateNxBytes(void *dst, size_t dstCapacity, BYTE byte, size_t length) +{ + if (length > dstCapacity) + return ERROR(dstSize_tooSmall); + memset(dst, byte, length); + return length; +} + +/** ZSTD_findFrameCompressedSize() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the compressed size of the frame starting at `src` */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) +{ + if (srcSize >= ZSTD_skippableHeaderSize && (ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + return ZSTD_skippableHeaderSize + ZSTD_readLE32((const BYTE *)src + 4); + } else { + const BYTE *ip = (const BYTE *)src; + const BYTE *const ipstart = ip; + size_t remainingSize = srcSize; + ZSTD_frameParams fParams; + + size_t const headerSize = ZSTD_frameHeaderSize(ip, remainingSize); + if (ZSTD_isError(headerSize)) + return headerSize; + + /* Frame Header */ + { + size_t const ret = ZSTD_getFrameParams(&fParams, ip, remainingSize); + if (ZSTD_isError(ret)) + return ret; + if (ret > 0) + return ERROR(srcSize_wrong); + } + + ip += headerSize; + remainingSize -= headerSize; + + /* Loop on each block */ + while (1) { + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return cBlockSize; + + if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) + return ERROR(srcSize_wrong); + + ip += ZSTD_blockHeaderSize + cBlockSize; + remainingSize -= ZSTD_blockHeaderSize + cBlockSize; + + if (blockProperties.lastBlock) + break; + } + + if (fParams.checksumFlag) { /* Frame content checksum */ + 
if (remainingSize < 4) + return ERROR(srcSize_wrong); + ip += 4; + remainingSize -= 4; + } + + return ip - ipstart; + } +} + +/*! ZSTD_decompressFrame() : +* @dctx must be properly initialized */ +static size_t ZSTD_decompressFrame(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void **srcPtr, size_t *srcSizePtr) +{ + const BYTE *ip = (const BYTE *)(*srcPtr); + BYTE *const ostart = (BYTE * const)dst; + BYTE *const oend = ostart + dstCapacity; + BYTE *op = ostart; + size_t remainingSize = *srcSizePtr; + + /* check */ + if (remainingSize < ZSTD_frameHeaderSize_min + ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); + + /* Frame Header */ + { + size_t const frameHeaderSize = ZSTD_frameHeaderSize(ip, ZSTD_frameHeaderSize_prefix); + if (ZSTD_isError(frameHeaderSize)) + return frameHeaderSize; + if (remainingSize < frameHeaderSize + ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); + CHECK_F(ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize)); + ip += frameHeaderSize; + remainingSize -= frameHeaderSize; + } + + /* Loop on each block */ + while (1) { + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return cBlockSize; + + ip += ZSTD_blockHeaderSize; + remainingSize -= ZSTD_blockHeaderSize; + if (cBlockSize > remainingSize) + return ERROR(srcSize_wrong); + + switch (blockProperties.blockType) { + case bt_compressed: decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend - op, ip, cBlockSize); break; + case bt_raw: decodedSize = ZSTD_copyRawBlock(op, oend - op, ip, cBlockSize); break; + case bt_rle: decodedSize = ZSTD_generateNxBytes(op, oend - op, *ip, blockProperties.origSize); break; + case bt_reserved: + default: return ERROR(corruption_detected); + } + + if (ZSTD_isError(decodedSize)) + return decodedSize; + if (dctx->fParams.checksumFlag) + xxh64_update(&dctx->xxhState, op, decodedSize); + op += decodedSize; + ip += cBlockSize; 
+ remainingSize -= cBlockSize; + if (blockProperties.lastBlock) + break; + } + + if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ + U32 const checkCalc = (U32)xxh64_digest(&dctx->xxhState); + U32 checkRead; + if (remainingSize < 4) + return ERROR(checksum_wrong); + checkRead = ZSTD_readLE32(ip); + if (checkRead != checkCalc) + return ERROR(checksum_wrong); + ip += 4; + remainingSize -= 4; + } + + /* Allow caller to get size read */ + *srcPtr = ip; + *srcSizePtr = remainingSize; + return op - ostart; +} + +static const void *ZSTD_DDictDictContent(const ZSTD_DDict *ddict); +static size_t ZSTD_DDictDictSize(const ZSTD_DDict *ddict); + +static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize, + const ZSTD_DDict *ddict) +{ + void *const dststart = dst; + + if (ddict) { + if (dict) { + /* programmer error, these two cases should be mutually exclusive */ + return ERROR(GENERIC); + } + + dict = ZSTD_DDictDictContent(ddict); + dictSize = ZSTD_DDictDictSize(ddict); + } + + while (srcSize >= ZSTD_frameHeaderSize_prefix) { + U32 magicNumber; + + magicNumber = ZSTD_readLE32(src); + if (magicNumber != ZSTD_MAGICNUMBER) { + if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t skippableSize; + if (srcSize < ZSTD_skippableHeaderSize) + return ERROR(srcSize_wrong); + skippableSize = ZSTD_readLE32((const BYTE *)src + 4) + ZSTD_skippableHeaderSize; + if (srcSize < skippableSize) { + return ERROR(srcSize_wrong); + } + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } else { + return ERROR(prefix_unknown); + } + } + + if (ddict) { + /* we were called from ZSTD_decompress_usingDDict */ + ZSTD_refDDict(dctx, ddict); + } else { + /* this will initialize correctly with no dict if dict == NULL, so + * use this in all cases but ddict */ + CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize)); + } + 
ZSTD_checkContinuity(dctx, dst); + + { + const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, &src, &srcSize); + if (ZSTD_isError(res)) + return res; + /* don't need to bounds check this, ZSTD_decompressFrame will have + * already */ + dst = (BYTE *)dst + res; + dstCapacity -= res; + } + } + + if (srcSize) + return ERROR(srcSize_wrong); /* input not entirely consumed */ + + return (BYTE *)dst - (BYTE *)dststart; +} + +size_t ZSTD_decompress_usingDict(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize) +{ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL); +} + +size_t ZSTD_decompressDCtx(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + return ZSTD_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, NULL, 0); +} + +/*-************************************** +* Advanced Streaming Decompression API +* Bufferless and synchronous +****************************************/ +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx) { return dctx->expected; } + +ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx) +{ + switch (dctx->stage) { + default: /* should not happen */ + case ZSTDds_getFrameHeaderSize: + case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; + case ZSTDds_decodeBlockHeader: return ZSTDnit_blockHeader; + case ZSTDds_decompressBlock: return ZSTDnit_block; + case ZSTDds_decompressLastBlock: return ZSTDnit_lastBlock; + case ZSTDds_checkChecksum: return ZSTDnit_checksum; + case ZSTDds_decodeSkippableHeader: + case ZSTDds_skipFrame: return ZSTDnit_skippableFrame; + } +} + +int ZSTD_isSkipFrame(ZSTD_DCtx *dctx) { return dctx->stage == ZSTDds_skipFrame; } /* for zbuff */ + +/** ZSTD_decompressContinue() : +* @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity) +* or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, 
void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + /* Sanity check */ + if (srcSize != dctx->expected) + return ERROR(srcSize_wrong); + if (dstCapacity) + ZSTD_checkContinuity(dctx, dst); + + switch (dctx->stage) { + case ZSTDds_getFrameHeaderSize: + if (srcSize != ZSTD_frameHeaderSize_prefix) + return ERROR(srcSize_wrong); /* impossible */ + if ((ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix); + dctx->expected = ZSTD_skippableHeaderSize - ZSTD_frameHeaderSize_prefix; /* magic number + skippable frame length */ + dctx->stage = ZSTDds_decodeSkippableHeader; + return 0; + } + dctx->headerSize = ZSTD_frameHeaderSize(src, ZSTD_frameHeaderSize_prefix); + if (ZSTD_isError(dctx->headerSize)) + return dctx->headerSize; + memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix); + if (dctx->headerSize > ZSTD_frameHeaderSize_prefix) { + dctx->expected = dctx->headerSize - ZSTD_frameHeaderSize_prefix; + dctx->stage = ZSTDds_decodeFrameHeader; + return 0; + } + dctx->expected = 0; /* not necessary to copy more */ + + case ZSTDds_decodeFrameHeader: + memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected); + CHECK_F(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize)); + dctx->expected = ZSTD_blockHeaderSize; + dctx->stage = ZSTDds_decodeBlockHeader; + return 0; + + case ZSTDds_decodeBlockHeader: { + blockProperties_t bp; + size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp); + if (ZSTD_isError(cBlockSize)) + return cBlockSize; + dctx->expected = cBlockSize; + dctx->bType = bp.blockType; + dctx->rleSize = bp.origSize; + if (cBlockSize) { + dctx->stage = bp.lastBlock ? 
ZSTDds_decompressLastBlock : ZSTDds_decompressBlock; + return 0; + } + /* empty block */ + if (bp.lastBlock) { + if (dctx->fParams.checksumFlag) { + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* end of frame */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->expected = 3; /* go directly to next header */ + dctx->stage = ZSTDds_decodeBlockHeader; + } + return 0; + } + case ZSTDds_decompressLastBlock: + case ZSTDds_decompressBlock: { + size_t rSize; + switch (dctx->bType) { + case bt_compressed: rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize); break; + case bt_raw: rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); break; + case bt_rle: rSize = ZSTD_setRleBlock(dst, dstCapacity, src, srcSize, dctx->rleSize); break; + case bt_reserved: /* should never happen */ + default: return ERROR(corruption_detected); + } + if (ZSTD_isError(rSize)) + return rSize; + if (dctx->fParams.checksumFlag) + xxh64_update(&dctx->xxhState, dst, rSize); + + if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ + if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* ends here */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->stage = ZSTDds_decodeBlockHeader; + dctx->expected = ZSTD_blockHeaderSize; + dctx->previousDstEnd = (char *)dst + rSize; + } + return rSize; + } + case ZSTDds_checkChecksum: { + U32 const h32 = (U32)xxh64_digest(&dctx->xxhState); + U32 const check32 = ZSTD_readLE32(src); /* srcSize == 4, guaranteed by dctx->expected */ + if (check32 != h32) + return ERROR(checksum_wrong); + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + case ZSTDds_decodeSkippableHeader: { + memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected); + dctx->expected = ZSTD_readLE32(dctx->headerBuffer + 4); 
+ dctx->stage = ZSTDds_skipFrame; + return 0; + } + case ZSTDds_skipFrame: { + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + default: + return ERROR(GENERIC); /* impossible */ + } +} + +static size_t ZSTD_refDictContent(ZSTD_DCtx *dctx, const void *dict, size_t dictSize) +{ + dctx->dictEnd = dctx->previousDstEnd; + dctx->vBase = (const char *)dict - ((const char *)(dctx->previousDstEnd) - (const char *)(dctx->base)); + dctx->base = dict; + dctx->previousDstEnd = (const char *)dict + dictSize; + return 0; +} + +/* ZSTD_loadEntropy() : + * dict : must point at beginning of a valid zstd dictionary + * @return : size of entropy tables read */ +static size_t ZSTD_loadEntropy(ZSTD_entropyTables_t *entropy, const void *const dict, size_t const dictSize) +{ + const BYTE *dictPtr = (const BYTE *)dict; + const BYTE *const dictEnd = dictPtr + dictSize; + + if (dictSize <= 8) + return ERROR(dictionary_corrupted); + dictPtr += 8; /* skip header = magic + dictID */ + + { + size_t const hSize = HUF_readDTableX4_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, entropy->workspace, sizeof(entropy->workspace)); + if (HUF_isError(hSize)) + return ERROR(dictionary_corrupted); + dictPtr += hSize; + } + + { + short offcodeNCount[MaxOff + 1]; + U32 offcodeMaxValue = MaxOff, offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(offcodeHeaderSize)) + return ERROR(dictionary_corrupted); + if (offcodeLog > OffFSELog) + return ERROR(dictionary_corrupted); + CHECK_E(FSE_buildDTable_wksp(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted); + dictPtr += offcodeHeaderSize; + } + + { + short matchlengthNCount[MaxML + 1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, 
&matchlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(matchlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (matchlengthLog > MLFSELog) + return ERROR(dictionary_corrupted); + CHECK_E(FSE_buildDTable_wksp(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted); + dictPtr += matchlengthHeaderSize; + } + + { + short litlengthNCount[MaxLL + 1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(litlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (litlengthLog > LLFSELog) + return ERROR(dictionary_corrupted); + CHECK_E(FSE_buildDTable_wksp(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted); + dictPtr += litlengthHeaderSize; + } + + if (dictPtr + 12 > dictEnd) + return ERROR(dictionary_corrupted); + { + int i; + size_t const dictContentSize = (size_t)(dictEnd - (dictPtr + 12)); + for (i = 0; i < 3; i++) { + U32 const rep = ZSTD_readLE32(dictPtr); + dictPtr += 4; + if (rep == 0 || rep >= dictContentSize) + return ERROR(dictionary_corrupted); + entropy->rep[i] = rep; + } + } + + return dictPtr - (const BYTE *)dict; +} + +static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx *dctx, const void *dict, size_t dictSize) +{ + if (dictSize < 8) + return ZSTD_refDictContent(dctx, dict, dictSize); + { + U32 const magic = ZSTD_readLE32(dict); + if (magic != ZSTD_DICT_MAGIC) { + return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ + } + } + dctx->dictID = ZSTD_readLE32((const char *)dict + 4); + + /* load entropy tables */ + { + size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize); + if (ZSTD_isError(eSize)) + return ERROR(dictionary_corrupted); + dict = (const char *)dict + eSize; + 
dictSize -= eSize; + } + dctx->litEntropy = dctx->fseEntropy = 1; + + /* reference dictionary content */ + return ZSTD_refDictContent(dctx, dict, dictSize); +} + +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict, size_t dictSize) +{ + CHECK_F(ZSTD_decompressBegin(dctx)); + if (dict && dictSize) + CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted); + return 0; +} + +/* ====== ZSTD_DDict ====== */ + +struct ZSTD_DDict_s { + void *dictBuffer; + const void *dictContent; + size_t dictSize; + ZSTD_entropyTables_t entropy; + U32 dictID; + U32 entropyPresent; + ZSTD_customMem cMem; +}; /* typedef'd to ZSTD_DDict within "zstd.h" */ + +size_t ZSTD_DDictWorkspaceBound(void) { return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_DDict)); } + +static const void *ZSTD_DDictDictContent(const ZSTD_DDict *ddict) { return ddict->dictContent; } + +static size_t ZSTD_DDictDictSize(const ZSTD_DDict *ddict) { return ddict->dictSize; } + +static void ZSTD_refDDict(ZSTD_DCtx *dstDCtx, const ZSTD_DDict *ddict) +{ + ZSTD_decompressBegin(dstDCtx); /* init */ + if (ddict) { /* support refDDict on NULL */ + dstDCtx->dictID = ddict->dictID; + dstDCtx->base = ddict->dictContent; + dstDCtx->vBase = ddict->dictContent; + dstDCtx->dictEnd = (const BYTE *)ddict->dictContent + ddict->dictSize; + dstDCtx->previousDstEnd = dstDCtx->dictEnd; + if (ddict->entropyPresent) { + dstDCtx->litEntropy = 1; + dstDCtx->fseEntropy = 1; + dstDCtx->LLTptr = ddict->entropy.LLTable; + dstDCtx->MLTptr = ddict->entropy.MLTable; + dstDCtx->OFTptr = ddict->entropy.OFTable; + dstDCtx->HUFptr = ddict->entropy.hufTable; + dstDCtx->entropy.rep[0] = ddict->entropy.rep[0]; + dstDCtx->entropy.rep[1] = ddict->entropy.rep[1]; + dstDCtx->entropy.rep[2] = ddict->entropy.rep[2]; + } else { + dstDCtx->litEntropy = 0; + dstDCtx->fseEntropy = 0; + } + } +} + +static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict *ddict) +{ + ddict->dictID = 0; + ddict->entropyPresent = 0; + 
if (ddict->dictSize < 8) + return 0; + { + U32 const magic = ZSTD_readLE32(ddict->dictContent); + if (magic != ZSTD_DICT_MAGIC) + return 0; /* pure content mode */ + } + ddict->dictID = ZSTD_readLE32((const char *)ddict->dictContent + 4); + + /* load entropy tables */ + CHECK_E(ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted); + ddict->entropyPresent = 1; + return 0; +} + +static ZSTD_DDict *ZSTD_createDDict_advanced(const void *dict, size_t dictSize, unsigned byReference, ZSTD_customMem customMem) +{ + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + { + ZSTD_DDict *const ddict = (ZSTD_DDict *)ZSTD_malloc(sizeof(ZSTD_DDict), customMem); + if (!ddict) + return NULL; + ddict->cMem = customMem; + + if ((byReference) || (!dict) || (!dictSize)) { + ddict->dictBuffer = NULL; + ddict->dictContent = dict; + } else { + void *const internalBuffer = ZSTD_malloc(dictSize, customMem); + if (!internalBuffer) { + ZSTD_freeDDict(ddict); + return NULL; + } + memcpy(internalBuffer, dict, dictSize); + ddict->dictBuffer = internalBuffer; + ddict->dictContent = internalBuffer; + } + ddict->dictSize = dictSize; + ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + /* parse dictionary content */ + { + size_t const errorCode = ZSTD_loadEntropy_inDDict(ddict); + if (ZSTD_isError(errorCode)) { + ZSTD_freeDDict(ddict); + return NULL; + } + } + + return ddict; + } +} + +/*! ZSTD_initDDict() : +* Create a digested dictionary, to start decompression without startup delay. +* `dict` content is copied inside DDict. 
+* Consequently, `dict` can be released after `ZSTD_DDict` creation */ /* NOTE(review): the header comment does not match the code - the call below passes 1 for the `byReference` argument of ZSTD_createDDict_advanced(), so `dict` is referenced, not copied, and has to stay valid for as long as the returned DDict is used. The DDict is allocated through the allocator that ZSTD_initStack() builds from `workspace`; NULL is returned when creation fails. */ +ZSTD_DDict *ZSTD_initDDict(const void *dict, size_t dictSize, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + return ZSTD_createDDict_advanced(dict, dictSize, 1, stackMem); +} + +/* ZSTD_freeDDict() : hands ddict->dictBuffer (NULL when the dictionary is only referenced) and then `ddict` itself to ZSTD_free(), using the allocator stored in the DDict; always returns 0 */ +size_t ZSTD_freeDDict(ZSTD_DDict *ddict) +{ + if (ddict == NULL) + return 0; /* support free on NULL */ + { + ZSTD_customMem const cMem = ddict->cMem; /* copied out first : `ddict` itself is released below */ + ZSTD_free(ddict->dictBuffer, cMem); + ZSTD_free(ddict, cMem); + return 0; + } +} + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * If @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +unsigned ZSTD_getDictID_fromDict(const void *dict, size_t dictSize) +{ + if (dictSize < 8) /* too short to hold the 4-byte magic followed by the 4-byte dictID */ + return 0; + if (ZSTD_readLE32(dict) != ZSTD_DICT_MAGIC) + return 0; + return ZSTD_readLE32((const char *)dict + 4); +} + +/*! ZSTD_getDictID_fromDDict() : + * Provides the dictID of the dictionary loaded into `ddict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict) +{ + if (ddict == NULL) + return 0; + return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); /* re-reads the ID from the dictionary bytes */ +} + +/*! ZSTD_getDictID_fromFrame() : + * Provides the dictID required to decompress the frame stored within `src`. + * If @return == 0, the dictID could not be decoded. + * This could be for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). + * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. + * Note : this use case also happens when using a non-conformant dictionary. 
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. + * When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. */ +unsigned ZSTD_getDictID_fromFrame(const void *src, size_t srcSize) +{ + ZSTD_frameParams zfp = {0, 0, 0, 0}; + size_t const hError = ZSTD_getFrameParams(&zfp, src, srcSize); + if (ZSTD_isError(hError)) + return 0; + return zfp.dictID; +} + +/*! ZSTD_decompress_usingDDict() : +* Decompression using a pre-digested Dictionary +* Use dictionary without significant overhead. */ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const ZSTD_DDict *ddict) +{ + /* pass content and size in case legacy frames are encountered */ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, NULL, 0, ddict); +} + +/*===================================== +* Streaming decompression +*====================================*/ + +typedef enum { zdss_init, zdss_loadHeader, zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage; + +/* *** Resource management *** */ +struct ZSTD_DStream_s { + ZSTD_DCtx *dctx; + ZSTD_DDict *ddictLocal; + const ZSTD_DDict *ddict; + ZSTD_frameParams fParams; + ZSTD_dStreamStage stage; + char *inBuff; + size_t inBuffSize; + size_t inPos; + size_t maxWindowSize; + char *outBuff; + size_t outBuffSize; + size_t outStart; + size_t outEnd; + size_t blockSize; + BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; /* tmp buffer to store frame header */ + size_t lhSize; + ZSTD_customMem customMem; + void *legacyContext; + U32 previousLegacyVersion; + U32 legacyVersion; + U32 hostageByte; +}; /* typedef'd to ZSTD_DStream within "zstd.h" */ + +size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize) +{ + size_t const blockSize = MIN(maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX); + size_t const 
inBuffSize = blockSize; + size_t const outBuffSize = maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2; + return ZSTD_DCtxWorkspaceBound() + ZSTD_ALIGN(sizeof(ZSTD_DStream)) + ZSTD_ALIGN(inBuffSize) + ZSTD_ALIGN(outBuffSize); +} + +static ZSTD_DStream *ZSTD_createDStream_advanced(ZSTD_customMem customMem) +{ + ZSTD_DStream *zds; + + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + zds = (ZSTD_DStream *)ZSTD_malloc(sizeof(ZSTD_DStream), customMem); + if (zds == NULL) + return NULL; + memset(zds, 0, sizeof(ZSTD_DStream)); + memcpy(&zds->customMem, &customMem, sizeof(ZSTD_customMem)); + zds->dctx = ZSTD_createDCtx_advanced(customMem); + if (zds->dctx == NULL) { + ZSTD_freeDStream(zds); + return NULL; + } + zds->stage = zdss_init; + zds->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + return zds; +} + +ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + ZSTD_DStream *zds = ZSTD_createDStream_advanced(stackMem); + if (!zds) { + return NULL; + } + + zds->maxWindowSize = maxWindowSize; + zds->stage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; + ZSTD_freeDDict(zds->ddictLocal); + zds->ddictLocal = NULL; + zds->ddict = zds->ddictLocal; + zds->legacyVersion = 0; + zds->hostageByte = 0; + + { + size_t const blockSize = MIN(zds->maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX); + size_t const neededOutSize = zds->maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2; + + zds->inBuff = (char *)ZSTD_malloc(blockSize, zds->customMem); + zds->inBuffSize = blockSize; + zds->outBuff = (char *)ZSTD_malloc(neededOutSize, zds->customMem); + zds->outBuffSize = neededOutSize; + if (zds->inBuff == NULL || zds->outBuff == NULL) { + ZSTD_freeDStream(zds); + return NULL; + } + } + return zds; +} + +ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize, const ZSTD_DDict *ddict, void *workspace, size_t 
workspaceSize) +{ + ZSTD_DStream *zds = ZSTD_initDStream(maxWindowSize, workspace, workspaceSize); + if (zds) { + zds->ddict = ddict; + } + return zds; +} + +size_t ZSTD_freeDStream(ZSTD_DStream *zds) /* releases the dctx, the local ddict and both buffers held by zds, then zds itself; always returns 0 */ +{ + if (zds == NULL) + return 0; /* support free on null */ + { + ZSTD_customMem const cMem = zds->customMem; + ZSTD_freeDCtx(zds->dctx); + zds->dctx = NULL; + ZSTD_freeDDict(zds->ddictLocal); + zds->ddictLocal = NULL; + ZSTD_free(zds->inBuff, cMem); + zds->inBuff = NULL; + ZSTD_free(zds->outBuff, cMem); + zds->outBuff = NULL; + ZSTD_free(zds, cMem); + return 0; + } +} + +/* *** Initialization *** */ + +size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX + ZSTD_blockHeaderSize; } +size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; } + +size_t ZSTD_resetDStream(ZSTD_DStream *zds) /* puts zds back at the header-loading stage with every position zeroed; returns ZSTD_frameHeaderSize_prefix */ +{ + zds->stage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; + zds->legacyVersion = 0; + zds->hostageByte = 0; + return ZSTD_frameHeaderSize_prefix; +} + +/* ***** Decompression ***** */ + +ZSTD_STATIC size_t ZSTD_limitCopy(void *dst, size_t dstCapacity, const void *src, size_t srcSize) /* copies MIN(dstCapacity, srcSize) bytes from src to dst and returns that count */ +{ + size_t const length = MIN(dstCapacity, srcSize); + memcpy(dst, src, length); + return length; +} + +size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, ZSTD_inBuffer *input) /* runs the stage machine until input or output is exhausted; advances input->pos and output->pos; returns 0 once a frame is fully decoded and flushed, an error code, or a non-zero size hint for the next call */ +{ + const char *const istart = (const char *)(input->src) + input->pos; + const char *const iend = (const char *)(input->src) + input->size; + const char *ip = istart; + char *const ostart = (char *)(output->dst) + output->pos; + char *const oend = (char *)(output->dst) + output->size; + char *op = ostart; + U32 someMoreWork = 1; + + while (someMoreWork) { + switch (zds->stage) { + case zdss_init: + ZSTD_resetDStream(zds); /* transparent reset on starting decoding a new frame */ + /* fall-through */ + + case zdss_loadHeader: { + size_t const hSize = ZSTD_getFrameParams(&zds->fParams, zds->headerBuffer, zds->lhSize); + if (ZSTD_isError(hSize)) + 
return hSize; + if (hSize != 0) { /* need more input */ + size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */ + if (toLoad > (size_t)(iend - ip)) { /* not enough input to load full header */ + memcpy(zds->headerBuffer + zds->lhSize, ip, iend - ip); + zds->lhSize += iend - ip; + input->pos = input->size; + return (MAX(ZSTD_frameHeaderSize_min, hSize) - zds->lhSize) + + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); + zds->lhSize = hSize; + ip += toLoad; + break; + } + + /* check for single-pass mode opportunity */ + if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */ + && (U64)(size_t)(oend - op) >= zds->fParams.frameContentSize) { + size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend - istart); + if (cSize <= (size_t)(iend - istart)) { + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds->dctx, op, oend - op, istart, cSize, zds->ddict); + if (ZSTD_isError(decompressedSize)) + return decompressedSize; + ip = istart + cSize; + op += decompressedSize; + zds->dctx->expected = 0; + zds->stage = zdss_init; + someMoreWork = 0; + break; + } + } + + /* Consume header */ + ZSTD_refDDict(zds->dctx, zds->ddict); + { + size_t const h1Size = ZSTD_nextSrcSizeToDecompress(zds->dctx); /* == ZSTD_frameHeaderSize_prefix */ + CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer, h1Size)); + { + size_t const h2Size = ZSTD_nextSrcSizeToDecompress(zds->dctx); + CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer + h1Size, h2Size)); + } + } + + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + if (zds->fParams.windowSize > zds->maxWindowSize) + return ERROR(frameParameter_windowTooLarge); + + /* Buffers are preallocated, but double check */ + { + size_t const blockSize = MIN(zds->maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX); + size_t const 
neededOutSize = zds->maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2; + if (zds->inBuffSize < blockSize) { + return ERROR(GENERIC); + } + if (zds->outBuffSize < neededOutSize) { + return ERROR(GENERIC); + } + zds->blockSize = blockSize; + } + zds->stage = zdss_read; + } + /* fall-through */ + + case zdss_read: { + size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx); + if (neededInSize == 0) { /* end of frame */ + zds->stage = zdss_init; + someMoreWork = 0; + break; + } + if ((size_t)(iend - ip) >= neededInSize) { /* decode directly from src */ + const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx); + size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, zds->outBuff + zds->outStart, + (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart), ip, neededInSize); + if (ZSTD_isError(decodedSize)) + return decodedSize; + ip += neededInSize; + if (!decodedSize && !isSkipFrame) + break; /* this was just a header */ + zds->outEnd = zds->outStart + decodedSize; + zds->stage = zdss_flush; + break; + } + if (ip == iend) { + someMoreWork = 0; + break; + } /* no more input */ + zds->stage = zdss_load; + /* fall-through */ + } + + case zdss_load: { + size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx); + size_t const toLoad = neededInSize - zds->inPos; /* should always be <= remaining space within inBuff */ + size_t loadedSize; + if (toLoad > zds->inBuffSize - zds->inPos) + return ERROR(corruption_detected); /* should never happen */ + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend - ip); + ip += loadedSize; + zds->inPos += loadedSize; + if (loadedSize < toLoad) { + someMoreWork = 0; + break; + } /* not enough input, wait for more */ + + /* decode loaded input */ + { + const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx); + size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart, + zds->inBuff, neededInSize); + if (ZSTD_isError(decodedSize)) + return 
decodedSize; + zds->inPos = 0; /* input is consumed */ + if (!decodedSize && !isSkipFrame) { + zds->stage = zdss_read; + break; + } /* this was just a header */ + zds->outEnd = zds->outStart + decodedSize; + zds->stage = zdss_flush; + /* fall-through */ + } + } + + case zdss_flush: { + size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, oend - op, zds->outBuff + zds->outStart, toFlushSize); + op += flushedSize; + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->stage = zdss_read; + if (zds->outStart + zds->blockSize > zds->outBuffSize) + zds->outStart = zds->outEnd = 0; /* less than one block of room left in outBuff : restart at its beginning */ + break; + } + /* cannot complete flush */ + someMoreWork = 0; + break; + } + default: + return ERROR(GENERIC); /* impossible */ + } + } + + /* result */ + input->pos += (size_t)(ip - istart); + output->pos += (size_t)(op - ostart); + { + size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds->dctx); + if (!nextSrcSizeHint) { /* frame fully decoded */ + if (zds->outEnd == zds->outStart) { /* output fully flushed */ + if (zds->hostageByte) { + if (input->pos >= input->size) { + zds->stage = zdss_read; + return 1; + } /* can't release hostage (not present) */ + input->pos++; /* release hostage */ + } + return 0; + } + if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */ + input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */ + zds->hostageByte = 1; + } + return 1; + } + nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds->dctx) == ZSTDnit_block); /* preload header of next block */ + if (zds->inPos > nextSrcSizeHint) + return ERROR(GENERIC); /* should never happen */ + nextSrcSizeHint -= zds->inPos; /* already loaded */ + return nextSrcSizeHint; + } +} + +EXPORT_SYMBOL(ZSTD_DCtxWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initDCtx); +EXPORT_SYMBOL(ZSTD_decompressDCtx); 
+EXPORT_SYMBOL(ZSTD_decompress_usingDict); + +EXPORT_SYMBOL(ZSTD_DDictWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initDDict); +EXPORT_SYMBOL(ZSTD_decompress_usingDDict); + +EXPORT_SYMBOL(ZSTD_DStreamWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initDStream); +EXPORT_SYMBOL(ZSTD_initDStream_usingDDict); +EXPORT_SYMBOL(ZSTD_resetDStream); +EXPORT_SYMBOL(ZSTD_decompressStream); +EXPORT_SYMBOL(ZSTD_DStreamInSize); +EXPORT_SYMBOL(ZSTD_DStreamOutSize); + +EXPORT_SYMBOL(ZSTD_findFrameCompressedSize); +EXPORT_SYMBOL(ZSTD_getFrameContentSize); +EXPORT_SYMBOL(ZSTD_findDecompressedSize); + +EXPORT_SYMBOL(ZSTD_isFrame); +EXPORT_SYMBOL(ZSTD_getDictID_fromDict); +EXPORT_SYMBOL(ZSTD_getDictID_fromDDict); +EXPORT_SYMBOL(ZSTD_getDictID_fromFrame); + +EXPORT_SYMBOL(ZSTD_getFrameParams); +EXPORT_SYMBOL(ZSTD_decompressBegin); +EXPORT_SYMBOL(ZSTD_decompressBegin_usingDict); +EXPORT_SYMBOL(ZSTD_copyDCtx); +EXPORT_SYMBOL(ZSTD_nextSrcSizeToDecompress); +EXPORT_SYMBOL(ZSTD_decompressContinue); +EXPORT_SYMBOL(ZSTD_nextInputType); + +EXPORT_SYMBOL(ZSTD_decompressBlock); +EXPORT_SYMBOL(ZSTD_insertBlock); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Decompressor"); diff --git a/lib/zstd/entropy_common.c b/lib/zstd/entropy_common.c new file mode 100644 index 000000000000..2b0a643c32c4 --- /dev/null +++ b/lib/zstd/entropy_common.c @@ -0,0 +1,243 @@ +/* + * Common functions of New Generation Entropy library + * Copyright (C) 2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + */ + +/* ************************************* +* Dependencies +***************************************/ +#include "error_private.h" /* ERR_*, ERROR */ +#include "fse.h" +#include "huf.h" +#include "mem.h" + +/*=== Version ===*/ +unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; } + +/*=== Error Management ===*/ +unsigned FSE_isError(size_t code) { return ERR_isError(code); } + +unsigned HUF_isError(size_t code) { return ERR_isError(code); } + +/*-************************************************************** +* FSE NCount encoding-decoding +****************************************************************/ +size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSVPtr, unsigned *tableLogPtr, const void *headerBuffer, size_t hbSize) /* parses the compact count table in headerBuffer[0..hbSize) into normalizedCounter; stores the table log and last symbol through the two pointers; returns bytes consumed or an ERROR() code */ +{ + const BYTE *const istart = (const BYTE *)headerBuffer; + const BYTE *const iend = istart + hbSize; + const BYTE *ip = istart; + int nbBits; + int remaining; + int threshold; + U32 bitStream; + int bitCount; + unsigned charnum = 0; + int previous0 = 0; + + if (hbSize < 4) + return ERROR(srcSize_wrong); /* the first read below fetches 32 bits */ + bitStream = ZSTD_readLE32(ip); + nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */ + if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) + return ERROR(tableLog_tooLarge); + bitStream >>= 4; + bitCount = 4; + *tableLogPtr = nbBits; + remaining = (1 << nbBits) + 1; + threshold = 1 << nbBits; + nbBits++; + + while ((remaining > 1) & (charnum <= *maxSVPtr)) { + if (previous0) { /* previous count was 0 : a run of zero-count symbols is encoded next */ + unsigned n0 = charnum; + while ((bitStream & 0xFFFF) == 0xFFFF) { + n0 += 24; /* each 0xFFFF group adds 24 zero-count symbols */ + if (ip < iend - 5) { + ip += 2; + bitStream = ZSTD_readLE32(ip) >> bitCount; + } else { + bitStream >>= 16; + bitCount += 16; + } + } + while ((bitStream & 3) == 3) { + n0 += 3; + bitStream >>= 2; + bitCount += 2; + } + n0 += bitStream & 3; + bitCount += 2; + if (n0 > *maxSVPtr) + return ERROR(maxSymbolValue_tooSmall); + while (charnum < n0) + 
normalizedCounter[charnum++] = 0; + if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) { + ip += bitCount >> 3; + bitCount &= 7; + bitStream = ZSTD_readLE32(ip) >> bitCount; + } else { + bitStream >>= 2; + } + } + { + int const max = (2 * threshold - 1) - remaining; + int count; + + if ((bitStream & (threshold - 1)) < (U32)max) { + count = bitStream & (threshold - 1); + bitCount += nbBits - 1; + } else { + count = bitStream & (2 * threshold - 1); + if (count >= threshold) + count -= max; + bitCount += nbBits; + } + + count--; /* extra accuracy */ + remaining -= count < 0 ? -count : count; /* -1 means +1 */ + normalizedCounter[charnum++] = (short)count; + previous0 = !count; + while (remaining < threshold) { + nbBits--; + threshold >>= 1; + } + + if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) { + ip += bitCount >> 3; + bitCount &= 7; + } else { + bitCount -= (int)(8 * (iend - 4 - ip)); + ip = iend - 4; /* near the end of input : pin ip to the last 4 bytes and carry the difference in bitCount */ + } + bitStream = ZSTD_readLE32(ip) >> (bitCount & 31); + } + } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */ + if (remaining != 1) + return ERROR(corruption_detected); /* the counts read must add up exactly to the table size */ + if (bitCount > 32) + return ERROR(corruption_detected); + *maxSVPtr = charnum - 1; /* report the last symbol actually read */ + + ip += (bitCount + 7) >> 3; + return ip - istart; +} + +/*! HUF_readStats_wksp() : + Read compact Huffman tree, saved by HUF_writeCTable(). + `huffWeight` is destination buffer; `hwSize` bounds how many weights may be stored in it. + `rankStats` is assumed to be a table of at least (HUF_TABLELOG_MAX + 1) U32, since that many entries are zeroed here. + @return : size read from `src`, or an error code. + Note : Needed by HUF_readCTable() and HUF_readDTableX?() . 
+*/ +size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize, void *workspace, size_t workspaceSize) +{ + U32 weightTotal; + const BYTE *ip = (const BYTE *)src; + size_t iSize; + size_t oSize; + + if (!srcSize) + return ERROR(srcSize_wrong); + iSize = ip[0]; + /* memset(huffWeight, 0, hwSize); */ /* is not necessary, even though some analyzers complain ... */ + + if (iSize >= 128) { /* special header : weights stored raw, two 4-bit values per byte */ + oSize = iSize - 127; + iSize = ((oSize + 1) / 2); + if (iSize + 1 > srcSize) + return ERROR(srcSize_wrong); + if (oSize >= hwSize) + return ERROR(corruption_detected); + ip += 1; + { + U32 n; + for (n = 0; n < oSize; n += 2) { + huffWeight[n] = ip[n / 2] >> 4; + huffWeight[n + 1] = ip[n / 2] & 15; + } + } + } else { /* header compressed with FSE (normal case) */ + if (iSize + 1 > srcSize) + return ERROR(srcSize_wrong); + oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, 6, workspace, workspaceSize); /* max (hwSize-1) values decoded, as last one is implied */ + if (FSE_isError(oSize)) + return oSize; + } + + /* collect weight stats */ + memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); + weightTotal = 0; + { + U32 n; + for (n = 0; n < oSize; n++) { + if (huffWeight[n] >= HUF_TABLELOG_MAX) + return ERROR(corruption_detected); + rankStats[huffWeight[n]]++; + weightTotal += (1 << huffWeight[n]) >> 1; /* weight w adds 2^(w-1); weight 0 adds nothing */ + } + } + if (weightTotal == 0) + return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ + { + U32 const tableLog = BIT_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) + return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { + U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; + U32 const verif = 1 << BIT_highbit32(rest); + U32 const lastWeight = BIT_highbit32(rest) + 1; + if (verif != rest) + return ERROR(corruption_detected); /* 
last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; + } + } + + /* check tree construction validity */ + if ((rankStats[1] < 2) || (rankStats[1] & 1)) + return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */ + + /* results */ + *nbSymbolsPtr = (U32)(oSize + 1); /* decoded weights plus the implied last one */ + return iSize + 1; /* one header byte plus iSize payload bytes */ +} diff --git a/lib/zstd/error_private.h b/lib/zstd/error_private.h new file mode 100644 index 000000000000..1a60b31f706c --- /dev/null +++ b/lib/zstd/error_private.h @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ */ + +/* Note : this module is expected to remain private, do not expose it */ + +#ifndef ERROR_H_MODULE +#define ERROR_H_MODULE + +/* **************************************** +* Dependencies +******************************************/ +#include /* size_t */ +#include /* enum list */ + +/* **************************************** +* Compiler-specific +******************************************/ +#define ERR_STATIC static __attribute__((unused)) + +/*-**************************************** +* Customization (error_public.h) +******************************************/ +typedef ZSTD_ErrorCode ERR_enum; +#define PREFIX(name) ZSTD_error_##name + +/*-**************************************** +* Error codes handling +******************************************/ +#define ERROR(name) ((size_t)-PREFIX(name)) + +ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + +ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) +{ + if (!ERR_isError(code)) + return (ERR_enum)0; + return (ERR_enum)(0 - code); +} + +#endif /* ERROR_H_MODULE */ diff --git a/lib/zstd/fse.h b/lib/zstd/fse.h new file mode 100644 index 000000000000..7460ab04b191 --- /dev/null +++ b/lib/zstd/fse.h @@ -0,0 +1,575 @@ +/* + * FSE : Finite State Entropy codec + * Public Prototypes declaration + * Copyright (C) 2013-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + */ +#ifndef FSE_H +#define FSE_H + +/*-***************************************** +* Dependencies +******************************************/ +#include /* size_t, ptrdiff_t */ + +/*-***************************************** +* FSE_PUBLIC_API : control library symbols visibility +******************************************/ +#define FSE_PUBLIC_API + +/*------ Version ------*/ +#define FSE_VERSION_MAJOR 0 +#define FSE_VERSION_MINOR 9 +#define FSE_VERSION_RELEASE 0 + +#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE +#define FSE_QUOTE(str) #str +#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str) +#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION) + +#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR * 100 * 100 + FSE_VERSION_MINOR * 100 + FSE_VERSION_RELEASE) +FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ + +/*-***************************************** +* Tool functions +******************************************/ +FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */ + +/* Error Management */ +FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */ + +/*-***************************************** +* FSE detailed API +******************************************/ +/*! +FSE_compress() does the following: +1. count symbol occurrence from source[] into table count[] +2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog) +3. save normalized counters to memory buffer using writeNCount() +4. build encoding table 'CTable' from normalized counters +5. encode the data stream using encoding table 'CTable' + +FSE_decompress() does the following: +1. read normalized counters with readNCount() +2. build decoding table 'DTable' from normalized counters +3. 
decode the data stream using decoding table 'DTable' + +The following API allows targeting specific sub-functions for advanced tasks. +For example, it's possible to compress several blocks using the same 'CTable', +or to save and provide normalized distribution using external method. +*/ + +/* *** COMPRESSION *** */ +/*! FSE_optimalTableLog(): + dynamically downsize 'tableLog' when conditions are met. + It saves CPU time, by using smaller tables, while preserving or even improving compression ratio. + @return : recommended tableLog (necessarily <= 'maxTableLog') */ +FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_normalizeCount(): + normalize counts so that sum(count[]) == Power_of_2 (2^tableLog) + 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). + @return : tableLog, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_NCountWriteBound(): + Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. + Typically useful for allocation purpose. */ +FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_writeNCount(): + Compactly save 'normalizedCounter' into 'buffer'. + @return : size of the compressed table, + or an errorCode, which can be tested using FSE_isError(). */ +FSE_PUBLIC_API size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ + +/*! 
FSE_compress_usingCTable(): + Compress `src` using `ct` into `dst` which must be already allocated. + @return : size of compressed data (<= `dstCapacity`), + or 0 if compressed data could not fit into `dst`, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_compress_usingCTable(void *dst, size_t dstCapacity, const void *src, size_t srcSize, const FSE_CTable *ct); + +/*! +Tutorial : +---------- +The first step is to count all symbols. FSE_count() does this job very fast. +Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells. +'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0] +maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value) +FSE_count() will return the number of occurrence of the most frequent symbol. +This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). + +The next step is to normalize the frequencies. +FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'. +It also guarantees a minimum of 1 to any Symbol with frequency >= 1. +You can use 'tableLog'==0 to mean "use default tableLog value". +If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(), +which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default"). + +The result of FSE_normalizeCount() will be saved into a table, +called 'normalizedCounter', which is a table of signed short. +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells. +The return value is tableLog if everything proceeded as expected. +It is 0 if there is a single symbol within distribution. 
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). + +'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount(). +'buffer' must be already allocated. +For guaranteed success, buffer size must be at least FSE_NCountWriteBound(). +The result of the function is the number of bytes written into 'buffer'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small). + +'normalizedCounter' can then be used to create the compression table 'CTable'. +The space required by 'CTable' must be already allocated, using FSE_createCTable(). +You can then use FSE_buildCTable() to fill 'CTable'. +If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()). + +'CTable' can then be used to compress 'src', with FSE_compress_usingCTable(). +Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize' +The function returns the size of compressed data (without header), necessarily <= `dstCapacity`. +If it returns '0', compressed data could not fit into 'dst'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). +*/ + +/* *** DECOMPRESSION *** */ + +/*! FSE_readNCount(): + Read compactly saved 'normalizedCounter' from 'rBuffer'. + @return : size read from 'rBuffer', + or an errorCode, which can be tested using FSE_isError(). + maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ +FSE_PUBLIC_API size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSymbolValuePtr, unsigned *tableLogPtr, const void *rBuffer, size_t rBuffSize); + +/*! Constructor and Destructor of FSE_DTable. + Note that its size depends on 'tableLog' */ +typedef unsigned FSE_DTable; /* don't allocate that. 
It's just a way to be more restrictive than void* */ + +/*! FSE_buildDTable(): + Builds 'dt', which must be already allocated, using FSE_createDTable(). + return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize); + +/*! FSE_decompress_usingDTable(): + Decompress compressed source `cSrc` of size `cSrcSize` using `dt` + into `dst` which must be already allocated. + @return : size of regenerated data (necessarily <= `dstCapacity`), + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt); + +/*! +Tutorial : +---------- +(Note : these functions only decompress FSE-compressed blocks. + If block is uncompressed, use memcpy() instead + If block is a single repeated byte, use memset() instead ) + +The first step is to obtain the normalized frequencies of symbols. +This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount(). +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short. +In practice, that means it's necessary to know 'maxSymbolValue' beforehand, +or size the table to handle worst case situations (typically 256). +FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'. +The result of FSE_readNCount() is the number of bytes read from 'rBuffer'. +Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that. +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'. +This is performed by the function FSE_buildDTable(). 
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable(). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable(). +`cSrcSize` must be strictly correct, otherwise decompression will fail. +FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small) +*/ + +/* *** Dependency *** */ +#include "bitstream.h" + +/* ***************************************** +* Static allocation +*******************************************/ +/* FSE buffer bounds */ +#define FSE_NCOUNTBOUND 512 +#define FSE_BLOCKBOUND(size) (size + (size >> 7)) +#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */ +#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1 << (maxTableLog - 1)) + ((maxSymbolValue + 1) * 2)) +#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1 << maxTableLog)) + +/* ***************************************** +* FSE advanced API +*******************************************/ +/* FSE_count_wksp() : + * Same as FSE_count(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= `1024` unsigned + */ +size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace); + +/* FSE_countFast_wksp() : + * Same as FSE_countFast(), but using an externally provided scratch buffer. + * `workSpace` must be a table of minimum `1024` unsigned + */ +size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize, unsigned *workSpace); + +/*! 
FSE_count_simple + * Same as FSE_countFast(), but does not use any additional memory (not even on stack). + * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`). +*/ +size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize); + +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); +/**< same as FSE_optimalTableLog(), which used `minus==2` */ + +size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits); +/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ + +size_t FSE_buildCTable_rle(FSE_CTable *ct, unsigned char symbolValue); +/**< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). + * `wkspSize` must be >= `(1<= BIT_DStream_completed + +When it's done, verify decompression is fully completed, by checking both DStream and the relevant states. +Checking if DStream has reached its end is performed by : + BIT_endOfDStream(&DStream); +Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible. 
+ FSE_endOfDState(&DState);
+*/
+
+/* *****************************************
+* FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+/* *****************************************
+* Implementation of inlined functions
+*******************************************/
+typedef struct {
+	int deltaFindState; /* offset added to (state >> nbBitsOut) when indexing the next-state table */
+	U32 deltaNbBits; /* added to the state value; bits 16 and up of the sum are used as nbBitsOut */
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+/* FSE_initCState() :
+ * Loads tableLog from the start of `ct` with ZSTD_read16(), starts the state at (1 << tableLog)
+ * and points stateTable / symbolTT at the two tables stored after the CTable header. */
+ZSTD_STATIC void FSE_initCState(FSE_CState_t *statePtr, const FSE_CTable *ct)
+{
+	const void *ptr = ct;
+	const U16 *u16ptr = (const U16 *)ptr;
+	const U32 tableLog = ZSTD_read16(ptr);
+	statePtr->value = (ptrdiff_t)1 << tableLog;
+	statePtr->stateTable = u16ptr + 2; /* skip the two U16 header fields */
+	statePtr->symbolTT = ((const U32 *)ct + 1 + (tableLog ? (1 << (tableLog - 1)) : 1)); /* 1 header U32, then the U16 state table counted in U32 units */
+	statePtr->stateLog = tableLog;
+}
+
+/*! FSE_initCState2() :
+* Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+* uses the smallest state value possible, saving the cost of this symbol */
+ZSTD_STATIC void FSE_initCState2(FSE_CState_t *statePtr, const FSE_CTable *ct, U32 symbol)
+{
+	FSE_initCState(statePtr, ct);
+	{
+		const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol];
+		const U16 *stateTable = (const U16 *)(statePtr->stateTable);
+		U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1 << 15)) >> 16);
+		statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits; /* temporary value, only used to index stateTable below */
+		statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+	}
+}
+/* FSE_encodeSymbol() :
+ * Hands the current state value and the bit count computed for `symbol` to BIT_addBits(),
+ * then replaces the state with the entry looked up in stateTable.
+ * No flush is done here; the caller decides when to flush `bitC`. */
+ZSTD_STATIC void FSE_encodeSymbol(BIT_CStream_t *bitC, FSE_CState_t *statePtr, U32 symbol)
+{
+	const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol];
+	const U16 *const stateTable = (const U16 *)(statePtr->stateTable);
+	U32 nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+	BIT_addBits(bitC, statePtr->value, nbBitsOut); /* presumably only the low nbBitsOut bits are kept by BIT_addBits() - confirm in bitstream.h */
+	statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+/* FSE_flushCState() :
+ * Adds the final state value to the stream using stateLog bits, then calls BIT_flushBits(). */
+ZSTD_STATIC void FSE_flushCState(BIT_CStream_t *bitC, const FSE_CState_t *statePtr)
+{
+	BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+	BIT_flushBits(bitC);
+}
+
+/* ====== Decompression ====== */
+
+typedef struct {
+	U16 tableLog;
+	U16 fastMode; /* cleared by FSE_buildDTable_wksp() when a symbol's count reaches half the table */
+} FSE_DTableHeader; /* sizeof U32 */
+
+typedef struct {
+	unsigned short newState; /* base of the next state; the freshly read bits are added to it */
+	unsigned char symbol; /* symbol decoded when the decoder is in this state */
+	unsigned char nbBits; /* number of bits to read to reach the next state */
+} FSE_decode_t; /* size == U32 */
+/* FSE_initDState() :
+ * Reads the initial state as tableLog bits from `bitD`, reloads the stream,
+ * and points the state at the decode cells that start one element after the header (`dt + 1`). */
+ZSTD_STATIC void FSE_initDState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD, const FSE_DTable *dt)
+{
+	const void *ptr = dt;
+	const FSE_DTableHeader *const DTableH = (const FSE_DTableHeader *)ptr;
+	DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+	BIT_reloadDStream(bitD);
+	DStatePtr->table = dt + 1;
+}
+/* FSE_peekSymbol() : returns the symbol of the current state; reads no bits and leaves the state unchanged. */
+ZSTD_STATIC BYTE FSE_peekSymbol(const FSE_DState_t *DStatePtr)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	return DInfo.symbol;
+}
+/* FSE_updateState() : moves to the next state (newState + nbBits bits read from `bitD`) without returning a symbol. */
+ZSTD_STATIC void FSE_updateState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	U32 const nbBits = DInfo.nbBits;
+	size_t const lowBits = BIT_readBits(bitD, nbBits);
+	DStatePtr->state = DInfo.newState + lowBits;
+}
+/* FSE_decodeSymbol() : returns the symbol of the current state, then moves to the next state using BIT_readBits(). */
+ZSTD_STATIC BYTE FSE_decodeSymbol(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	U32 const nbBits = DInfo.nbBits;
+	BYTE const symbol = DInfo.symbol;
+	size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+	DStatePtr->state = DInfo.newState + lowBits;
+	return symbol;
+}
+
+/*! 
FSE_decodeSymbolFast() :
+	unsafe, only works if no symbol has a probability > 50% */
+ZSTD_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	U32 const nbBits = DInfo.nbBits;
+	BYTE const symbol = DInfo.symbol;
+	size_t const lowBits = BIT_readBitsFast(bitD, nbBits); /* the only difference from FSE_decodeSymbol(): BIT_readBitsFast() instead of BIT_readBits() */
+
+	DStatePtr->state = DInfo.newState + lowBits;
+	return symbol;
+}
+/* FSE_endOfDState() : returns 1 when the decoder state value is 0, otherwise 0. */
+ZSTD_STATIC unsigned FSE_endOfDState(const FSE_DState_t *DStatePtr) { return DStatePtr->state == 0; }
+
+/* **************************************************************
+* Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+* Increasing memory usage improves compression ratio
+* Reduced memory usage can improve speed, due to cache effect
+* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+* Maximum symbol value authorized.
+* Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+* template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+/* ***************************************************************
+* Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE - 2) /* 12 when FSE_MAX_MEMORY_USAGE keeps its default of 14 */
+#define FSE_MAX_TABLESIZE (1U << FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE - 1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE - 2) /* 11 when FSE_DEFAULT_MEMORY_USAGE keeps its default of 13 */
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize >> 1) + (tableSize >> 3) + 3) /* stride used by the table builders when spreading symbols over the table */
+
+#endif /* FSE_H */
diff --git a/lib/zstd/fse_compress.c b/lib/zstd/fse_compress.c
new file mode 100644
index 000000000000..ef3d1741d532
--- /dev/null
+++ b/lib/zstd/fse_compress.c
@@ -0,0 +1,795 @@
+/*
+ * FSE : Finite State Entropy encoder
+ * Copyright (C) 2013-2015, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + */ + +/* ************************************************************** +* Compiler specifics +****************************************************************/ +#define FORCE_INLINE static __always_inline + +/* ************************************************************** +* Includes +****************************************************************/ +#include "bitstream.h" +#include "fse.h" +#include +#include +#include +#include /* memcpy, memset */ + +/* ************************************************************** +* Error Management +****************************************************************/ +#define FSE_STATIC_ASSERT(c) \ + { \ + enum { FSE_static_assert = 1 / (int)(!!(c)) }; \ + } /* use only *after* variable declarations */ + +/* ************************************************************** +* Templates +****************************************************************/ +/* + designed to be included + for type-specific functions (template emulation in C) + Objective is to write these functions only once, for improved maintenance +*/ + +/* safety checks */ +#ifndef FSE_FUNCTION_EXTENSION +#error "FSE_FUNCTION_EXTENSION must be defined" +#endif +#ifndef FSE_FUNCTION_TYPE +#error "FSE_FUNCTION_TYPE must be defined" +#endif + +/* Function names */ +#define FSE_CAT(X, Y) X##Y +#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y) +#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y) + +/* Function templates */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). 
+ * wkspSize should be sized to handle worst case situation, which is `1<> 1 : 1); + FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT); + U32 const step = FSE_TABLESTEP(tableSize); + U32 highThreshold = tableSize - 1; + + U32 *cumul; + FSE_FUNCTION_TYPE *tableSymbol; + size_t spaceUsed32 = 0; + + cumul = (U32 *)workspace + spaceUsed32; + spaceUsed32 += FSE_MAX_SYMBOL_VALUE + 2; + tableSymbol = (FSE_FUNCTION_TYPE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(FSE_FUNCTION_TYPE) * ((size_t)1 << tableLog), sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* CTable header */ + tableU16[-2] = (U16)tableLog; + tableU16[-1] = (U16)maxSymbolValue; + + /* For explanations on how to distribute symbol values over the table : + * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + /* symbol start positions */ + { + U32 u; + cumul[0] = 0; + for (u = 1; u <= maxSymbolValue + 1; u++) { + if (normalizedCounter[u - 1] == -1) { /* Low proba symbol */ + cumul[u] = cumul[u - 1] + 1; + tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u - 1); + } else { + cumul[u] = cumul[u - 1] + normalizedCounter[u - 1]; + } + } + cumul[maxSymbolValue + 1] = tableSize + 1; + } + + /* Spread symbols */ + { + U32 position = 0; + U32 symbol; + for (symbol = 0; symbol <= maxSymbolValue; symbol++) { + int nbOccurences; + for (nbOccurences = 0; nbOccurences < normalizedCounter[symbol]; nbOccurences++) { + tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol; + position = (position + step) & tableMask; + while (position > highThreshold) + position = (position + step) & tableMask; /* Low proba area */ + } + } + + if (position != 0) + return ERROR(GENERIC); /* Must have gone through all positions */ + } + + /* Build table */ + { + U32 u; + for (u = 0; u < tableSize; u++) { + 
FSE_FUNCTION_TYPE s = tableSymbol[u]; /* note : static analyzer may not understand tableSymbol is properly initialized */ + tableU16[cumul[s]++] = (U16)(tableSize + u); /* TableU16 : sorted by symbol order; gives next state value */ + } + } + + /* Build Symbol Transformation Table */ + { + unsigned total = 0; + unsigned s; + for (s = 0; s <= maxSymbolValue; s++) { + switch (normalizedCounter[s]) { + case 0: break; + + case -1: + case 1: + symbolTT[s].deltaNbBits = (tableLog << 16) - (1 << tableLog); + symbolTT[s].deltaFindState = total - 1; + total++; + break; + default: { + U32 const maxBitsOut = tableLog - BIT_highbit32(normalizedCounter[s] - 1); + U32 const minStatePlus = normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = total - normalizedCounter[s]; + total += normalizedCounter[s]; + } + } + } + } + + return 0; +} + +/*-************************************************************** +* FSE NCount encoding-decoding +****************************************************************/ +size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) +{ + size_t const maxHeaderSize = (((maxSymbolValue + 1) * tableLog) >> 3) + 3; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? 
use default */ +} + +static size_t FSE_writeNCount_generic(void *header, size_t headerBufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, + unsigned writeIsSafe) +{ + BYTE *const ostart = (BYTE *)header; + BYTE *out = ostart; + BYTE *const oend = ostart + headerBufferSize; + int nbBits; + const int tableSize = 1 << tableLog; + int remaining; + int threshold; + U32 bitStream; + int bitCount; + unsigned charnum = 0; + int previous0 = 0; + + bitStream = 0; + bitCount = 0; + /* Table Size */ + bitStream += (tableLog - FSE_MIN_TABLELOG) << bitCount; + bitCount += 4; + + /* Init */ + remaining = tableSize + 1; /* +1 for extra accuracy */ + threshold = tableSize; + nbBits = tableLog + 1; + + while (remaining > 1) { /* stops at 1 */ + if (previous0) { + unsigned start = charnum; + while (!normalizedCounter[charnum]) + charnum++; + while (charnum >= start + 24) { + start += 24; + bitStream += 0xFFFFU << bitCount; + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += 2; + bitStream >>= 16; + } + while (charnum >= start + 3) { + start += 3; + bitStream += 3 << bitCount; + bitCount += 2; + } + bitStream += (charnum - start) << bitCount; + bitCount += 2; + if (bitCount > 16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } + } + { + int count = normalizedCounter[charnum++]; + int const max = (2 * threshold - 1) - remaining; + remaining -= count < 0 ? -count : count; + count++; /* +1 for extra accuracy */ + if (count >= threshold) + count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */ + bitStream += count << bitCount; + bitCount += nbBits; + bitCount -= (count < max); + previous0 = (count == 1); + if (remaining < 1) + return ERROR(GENERIC); + while (remaining < threshold) + nbBits--, threshold >>= 1; + } + if (bitCount > 16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } + } + + /* flush remaining bitStream */ + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += (bitCount + 7) / 8; + + if (charnum > maxSymbolValue + 1) + return ERROR(GENERIC); + + return (out - ostart); +} + +size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + if (tableLog > FSE_MAX_TABLELOG) + return ERROR(tableLog_tooLarge); /* Unsupported */ + if (tableLog < FSE_MIN_TABLELOG) + return ERROR(GENERIC); /* Unsupported */ + + if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0); + + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1); +} + +/*-************************************************************** +* Counting histogram +****************************************************************/ +/*! FSE_count_simple + This function counts byte values within `src`, and store the histogram into table `count`. + It doesn't use any additional memory. + But this function is unsafe : it doesn't check that all values within `src` can fit into `count`. + For this reason, prefer using a table `count` with 256 elements. 
+ @return : count of most numerous element +*/ +size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize) +{ + const BYTE *ip = (const BYTE *)src; + const BYTE *const end = ip + srcSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned max = 0; + + memset(count, 0, (maxSymbolValue + 1) * sizeof(*count)); + if (srcSize == 0) { + *maxSymbolValuePtr = 0; + return 0; + } + + while (ip < end) + count[*ip++]++; + + while (!count[maxSymbolValue]) + maxSymbolValue--; + *maxSymbolValuePtr = maxSymbolValue; + + { + U32 s; + for (s = 0; s <= maxSymbolValue; s++) + if (count[s] > max) + max = count[s]; + } + + return (size_t)max; +} + +/* FSE_count_parallel_wksp() : + * Same as FSE_count_parallel(), but using an externally provided scratch buffer. + * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`` */ +static size_t FSE_count_parallel_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned checkMax, + unsigned *const workSpace) +{ + const BYTE *ip = (const BYTE *)source; + const BYTE *const iend = ip + sourceSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned max = 0; + U32 *const Counting1 = workSpace; + U32 *const Counting2 = Counting1 + 256; + U32 *const Counting3 = Counting2 + 256; + U32 *const Counting4 = Counting3 + 256; + + memset(Counting1, 0, 4 * 256 * sizeof(unsigned)); + + /* safety checks */ + if (!sourceSize) { + memset(count, 0, maxSymbolValue + 1); + *maxSymbolValuePtr = 0; + return 0; + } + if (!maxSymbolValue) + maxSymbolValue = 255; /* 0 == default */ + + /* by stripes of 16 bytes */ + { + U32 cached = ZSTD_read32(ip); + ip += 4; + while (ip < iend - 15) { + U32 c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + 
Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + } + ip -= 4; + } + + /* finish last symbols */ + while (ip < iend) + Counting1[*ip++]++; + + if (checkMax) { /* verify stats will fit into destination table */ + U32 s; + for (s = 255; s > maxSymbolValue; s--) { + Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s]; + if (Counting1[s]) + return ERROR(maxSymbolValue_tooSmall); + } + } + + { + U32 s; + for (s = 0; s <= maxSymbolValue; s++) { + count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; + if (count[s] > max) + max = count[s]; + } + } + + while (!count[maxSymbolValue]) + maxSymbolValue--; + *maxSymbolValuePtr = maxSymbolValue; + return (size_t)max; +} + +/* FSE_countFast_wksp() : + * Same as FSE_countFast(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= `1024` unsigned */ +size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace) +{ + if (sourceSize < 1500) + return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize); + return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace); +} + +/* FSE_count_wksp() : + * Same as FSE_count(), but using an externally provided scratch buffer. 
+ * `workSpace` size must be table of >= `1024` unsigned */
+size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace)
+{
+	if (*maxSymbolValuePtr < 255) /* restricted alphabet: use the counter that verifies it (checkMax = 1) */
+		return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
+	*maxSymbolValuePtr = 255; /* values above 255 are lowered to 255 before counting */
+	return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
+}
+
+/*-**************************************************************
+* FSE Compression Code
+****************************************************************/
+/*! FSE_sizeof_CTable() :
+	FSE_CTable is a variable size structure which contains :
+	`U16 tableLog;`
+	`U16 maxSymbolValue;`
+	`U16 nextStateNumber[1 << tableLog];` // This size is variable
+	`FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];` // This size is variable
+Allocation is manual (C standard does not support variable-size structures).
+Returns the size in bytes, or ERROR(tableLog_tooLarge) when tableLog exceeds FSE_MAX_TABLELOG.
+*/
+size_t FSE_sizeof_CTable(unsigned maxSymbolValue, unsigned tableLog)
+{
+	if (tableLog > FSE_MAX_TABLELOG)
+		return ERROR(tableLog_tooLarge);
+	return FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue) * sizeof(U32);
+}
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+	U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1; /* NOTE(review): srcSize - 1 wraps when srcSize == 0; presumably callers guarantee srcSize > 1 - confirm */
+	U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+	U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+	return minBits;
+}
+/* FSE_optimalTableLog_internal() :
+ * Starts from maxTableLog (0 selects FSE_DEFAULT_TABLELOG), lowers it to maxBitsSrc, raises it to
+ * FSE_minTableLog(), then clamps the result to [FSE_MIN_TABLELOG, FSE_MAX_TABLELOG]. */
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+	U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+	U32 tableLog = maxTableLog;
+	U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+	if (tableLog == 0)
+		tableLog = FSE_DEFAULT_TABLELOG;
+	if (maxBitsSrc < tableLog)
+		tableLog = maxBitsSrc; /* Accuracy can be reduced */
+	if (minBits > tableLog)
+		tableLog = minBits; /* Need a minimum to safely represent all symbol values */
+	if (tableLog < FSE_MIN_TABLELOG)
+		tableLog = FSE_MIN_TABLELOG;
+	if (tableLog > FSE_MAX_TABLELOG)
+		tableLog = FSE_MAX_TABLELOG;
+	return tableLog;
+}
+/* FSE_optimalTableLog() : FSE_optimalTableLog_internal() with `minus` fixed to 2. */
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+	return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+/* Secondary normalization method.
+   To be used when primary method fails.
+   Returns 0 on success, ERROR(GENERIC) when a remaining symbol would get a weight below 1. */
+
+static size_t FSE_normalizeM2(short *norm, U32 tableLog, const unsigned *count, size_t total, U32 maxSymbolValue)
+{
+	short const NOT_YET_ASSIGNED = -2; /* marks symbols whose weight is decided in a later pass */
+	U32 s;
+	U32 distributed = 0;
+	U32 ToDistribute;
+
+	/* Init */
+	U32 const lowThreshold = (U32)(total >> tableLog);
+	U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+	for (s = 0; s <= maxSymbolValue; s++) {
+		if (count[s] == 0) {
+			norm[s] = 0;
+			continue;
+		}
+		if (count[s] <= lowThreshold) {
+			norm[s] = -1;
+			distributed++;
+			total -= count[s];
+			continue;
+		}
+		if (count[s] <= lowOne) {
+			norm[s] = 1;
+			distributed++;
+			total -= count[s];
+			continue;
+		}
+
+		norm[s] = NOT_YET_ASSIGNED;
+	}
+	ToDistribute = (1 << tableLog) - distributed;
+
+	if ((total / ToDistribute) > lowOne) {
+		/* risk of rounding to zero */
+		lowOne = (U32)((total * 3) / (ToDistribute * 2));
+		for (s = 0; s <= maxSymbolValue; s++) {
+			if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+				norm[s] = 1;
+				distributed++;
+				total -= count[s];
+				continue;
+			}
+		}
+		ToDistribute = (1 << tableLog) - distributed;
+	}
+
+	if (distributed == maxSymbolValue + 1) {
+		/* all values are pretty poor;
+		   probably incompressible data (should have already been detected);
+		   find max, then give all remaining points to max */
+		U32 maxV = 0, maxC = 0;
+		for (s = 0; s <= maxSymbolValue; s++)
+			if (count[s] > maxC)
+				maxV = s, maxC = count[s];
+		norm[maxV] += (short)ToDistribute;
+		return 0;
+	}
+
+	if (total == 0) {
+		/* all of the symbols were low enough for the lowOne or lowThreshold */
+		for (s = 0; ToDistribute > 0; s = (s + 1) % (maxSymbolValue + 1)) /* round-robin over the symbols until nothing is left */
+			if (norm[s] > 0)
+				ToDistribute--, norm[s]++;
+		return 0;
+	}
+
+	{
+		U64 const vStepLog = 62 - tableLog;
+		U64 const mid = (1ULL << (vStepLog - 1)) - 1;
+		U64 const rStep = div_u64((((U64)1 << vStepLog) * ToDistribute) + mid, (U32)total); /* scale on remaining */
+		U64 tmpTotal = mid;
+		for (s = 0; s <= maxSymbolValue; s++) {
+			if (norm[s] == NOT_YET_ASSIGNED) {
+				U64 const end = tmpTotal + (count[s] * rStep);
+				U32 const sStart = (U32)(tmpTotal >> vStepLog);
+				U32 const sEnd = (U32)(end >> vStepLog);
+				U32 const weight = sEnd - sStart;
+				if (weight < 1)
+					return ERROR(GENERIC);
+				norm[s] = (short)weight;
+				tmpTotal = end;
+			}
+		}
+	}
+
+	return 0;
+}
+/* FSE_normalizeCount() :
+ * Scales `count` into `normalizedCounter`, starting from a budget of (1 << tableLog) points.
+ * Returns tableLog on success, 0 when a single symbol accounts for the whole input (rle),
+ * or an error code; tableLog == 0 selects FSE_DEFAULT_TABLELOG. */
+size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t total, unsigned maxSymbolValue)
+{
+	/* Sanity checks */
+	if (tableLog == 0)
+		tableLog = FSE_DEFAULT_TABLELOG;
+	if (tableLog < FSE_MIN_TABLELOG)
+		return ERROR(GENERIC); /* Unsupported size */
+	if (tableLog > FSE_MAX_TABLELOG)
+		return ERROR(tableLog_tooLarge); /* Unsupported size */
+	if (tableLog < FSE_minTableLog(total, maxSymbolValue))
+		return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
+
+	{
+		U32 const rtbTable[] = {0, 473195, 504333, 520860, 550000, 700000, 750000, 830000};
+		U64 const scale = 62 - tableLog;
+		U64 const step = div_u64((U64)1 << 62, (U32)total); /* <== here, one division ! */
+		U64 const vStep = 1ULL << (scale - 20);
+		int stillToDistribute = 1 << tableLog;
+		unsigned s;
+		unsigned largest = 0;
+		short largestP = 0;
+		U32 lowThreshold = (U32)(total >> tableLog);
+
+		for (s = 0; s <= maxSymbolValue; s++) {
+			if (count[s] == total)
+				return 0; /* rle special case */
+			if (count[s] == 0) {
+				normalizedCounter[s] = 0;
+				continue;
+			}
+			if (count[s] <= lowThreshold) {
+				normalizedCounter[s] = -1;
+				stillToDistribute--;
+			} else {
+				short proba = (short)((count[s] * step) >> scale);
+				if (proba < 8) {
+					U64 restToBeat = vStep * rtbTable[proba];
+					proba += (count[s] * step) - ((U64)proba << scale) > restToBeat; /* round up when the remainder exceeds the table threshold */
+				}
+				if (proba > largestP)
+					largestP = proba, largest = s;
+				normalizedCounter[s] = proba;
+				stillToDistribute -= proba;
+			}
+		}
+		if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+			/* corner case, need another normalization method */
+			size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+			if (FSE_isError(errorCode))
+				return errorCode;
+		} else
+			normalizedCounter[largest] += (short)stillToDistribute; /* the rounding difference is absorbed by the most probable symbol */
+	}
+
+	return tableLog;
+}
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits)
+{
+	const unsigned tableSize = 1 << nbBits; /* computed before the nbBits < 1 check below */
+	const unsigned tableMask = tableSize - 1;
+	const unsigned maxSymbolValue = tableMask;
+	void *const ptr = ct;
+	U16 *const tableU16 = ((U16 *)ptr) + 2;
+	void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableSize >> 1); /* assumption : tableLog >= 1 */
+	FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
+	unsigned s;
+
+	/* Sanity checks */
+	if (nbBits < 1)
+		return ERROR(GENERIC); /* min size */
+
+	/* header */
+	tableU16[-2] = (U16)nbBits;
+	tableU16[-1] = (U16)maxSymbolValue;
+
+	/* Build table */
+	for (s = 0; s < tableSize; s++)
+		tableU16[s] = (U16)(tableSize + s);
+
+	/* Build Symbol Transformation Table */
+	{
+		const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits); /* same value for every symbol */
+		for (s = 0; s <= maxSymbolValue; s++) {
+			symbolTT[s].deltaNbBits = deltaNbBits;
+			symbolTT[s].deltaFindState = s - 1;
+		}
+	}
+
+	return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle(FSE_CTable *ct, BYTE symbolValue)
+{
+	void *ptr = ct;
+	U16 *tableU16 = ((U16 *)ptr) + 2;
+	void *FSCTptr = (U32 *)ptr + 2; /* 1 header U32 plus 1 U32 holding the two U16 state entries written below */
+	FSE_symbolCompressionTransform *symbolTT = (FSE_symbolCompressionTransform *)FSCTptr;
+
+	/* header */
+	tableU16[-2] = (U16)0;
+	tableU16[-1] = (U16)symbolValue;
+
+	/* Build table */
+	tableU16[0] = 0;
+	tableU16[1] = 0; /* just in case */
+
+	/* Build Symbol Transformation Table */
+	symbolTT[symbolValue].deltaNbBits = 0; /* only the entry for symbolValue is written */
+	symbolTT[symbolValue].deltaFindState = 0;
+
+	return 0;
+}
+/* FSE_compress_usingCTable_generic() :
+ * Encodes `src` backwards, from its last byte down to istart, alternating between two states.
+ * Returns 0 when srcSize <= 2 or when BIT_initCStream() reports an error; otherwise returns
+ * the value of BIT_closeCStream(). `fast` selects BIT_flushBitsFast() over BIT_flushBits(). */
+static size_t FSE_compress_usingCTable_generic(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct, const unsigned fast)
+{
+	const BYTE *const istart = (const BYTE *)src;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *ip = iend;
+
+	BIT_CStream_t bitC;
+	FSE_CState_t CState1, CState2;
+
+	/* init */
+	if (srcSize <= 2)
+		return 0;
+	{
+		size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+		if (FSE_isError(initError))
+			return 0; /* not enough space available to write a bitstream */
+	}
+
+#define FSE_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+	if (srcSize & 1) { /* odd length: encode one extra symbol so that an even number remains */
+		FSE_initCState2(&CState1, ct, *--ip);
+		FSE_initCState2(&CState2, ct, *--ip);
+		FSE_encodeSymbol(&bitC, &CState1, *--ip);
+		FSE_FLUSHBITS(&bitC);
+	} else {
+		FSE_initCState2(&CState2, ct, *--ip);
+		FSE_initCState2(&CState1, ct, *--ip);
+	}
+
+	/* join to mod 4 */
+	srcSize -= 2;
+	if ((sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) && (srcSize & 2)) { /* test bit 2 */
+		FSE_encodeSymbol(&bitC, &CState2, *--ip);
+		FSE_encodeSymbol(&bitC, &CState1, *--ip);
+		FSE_FLUSHBITS(&bitC);
+	}
+
+	/* 2 or 4 encoding per loop */
+	while (ip > istart) {
+
+		FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+		if (sizeof(bitC.bitContainer) * 8 < FSE_MAX_TABLELOG * 2 + 7) /* this test must be static */
+			FSE_FLUSHBITS(&bitC);
+
+		FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+		if (sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) { /* this test must be static */
+			FSE_encodeSymbol(&bitC, &CState2, *--ip);
+			FSE_encodeSymbol(&bitC, &CState1, *--ip);
+		}
+
+		FSE_FLUSHBITS(&bitC);
+	}
+
+	FSE_flushCState(&bitC, &CState2);
+	FSE_flushCState(&bitC, &CState1);
+	return BIT_closeCStream(&bitC);
+}
+/* FSE_compress_usingCTable() : uses the fast flush variant when dstSize >= FSE_BLOCKBOUND(srcSize), the regular one otherwise. */
+size_t FSE_compress_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct)
+{
+	unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+	if (fast) /* literal 1 / 0 are passed so that `fast` is a constant inside the generic function */
+		return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+	else
+		return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+/* FSE_compressBound() : function form of the FSE_COMPRESSBOUND() macro. */
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
diff --git a/lib/zstd/fse_decompress.c b/lib/zstd/fse_decompress.c
new file mode 100644
index 000000000000..a84300e5a013
--- /dev/null
+++ b/lib/zstd/fse_decompress.c
@@ -0,0 +1,332 @@
+/*
+ * FSE : Finite State Entropy decoder
+ * Copyright (C) 2013-2015, Yann Collet. 
+ * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c)                                   \
+	{                                                      \
+		enum { FSE_static_assert = 1 / (int)(!!(c)) }; \
+	} /* use only *after* variable declarations */
+
+/* check and forward error code */
+#define CHECK_F(f)                  \
+	{                           \
+		size_t const e = f; \
+		if (FSE_isError(e)) \
+			return e;   \
+	}
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+ designed to be included
+ for type-specific functions (template emulation in C)
+ Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X, Y) X##Y
+#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
+#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
+
+/* Function templates */
+
+/*
+ * FSE_buildDTable_wksp() :
+ * Builds the FSE decoding table `dt` from normalized symbol counts.
+ * The table header is stored at `dt`, the decoding cells start at `dt + 1`.
+ * `normalizedCounter` is read for symbols 0..maxSymbolValue; a count of -1
+ * marks a low-probability symbol, which receives one cell at the top of the
+ * table.
+ * `workspace` is used as an array of U16 (one "next state" value per symbol)
+ * and must hold at least FSE_MAX_SYMBOL_VALUE + 1 entries.
+ * @return : 0 on success, or an error code when the workspace is too small,
+ *           when maxSymbolValue or tableLog exceed their limits, or when the
+ *           counts do not fill the table exactly.
+ */
+size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
+{
+	void *const tdPtr = dt + 1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
+	FSE_DECODE_TYPE *const tableDecode = (FSE_DECODE_TYPE *)(tdPtr);
+	U16 *symbolNext = (U16 *)workspace;
+
+	U32 const maxSV1 = maxSymbolValue + 1;
+	U32 tableSize;
+	U32 highThreshold;
+
+	/* Sanity Checks (must run before tableLog is used as a shift count) */
+	if (workspaceSize < sizeof(U16) * (FSE_MAX_SYMBOL_VALUE + 1))
+		return ERROR(tableLog_tooLarge);
+	if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE)
+		return ERROR(maxSymbolValue_tooLarge);
+	if (tableLog > FSE_MAX_TABLELOG)
+		return ERROR(tableLog_tooLarge);
+
+	tableSize = 1 << tableLog;
+	highThreshold = tableSize - 1;
+
+	/* Init, lay down lowprob symbols */
+	{
+		FSE_DTableHeader DTableH;
+		DTableH.tableLog = (U16)tableLog;
+		DTableH.fastMode = 1;
+		{
+			S16 const largeLimit = (S16)(1 << (tableLog - 1));
+			U32 s;
+			for (s = 0; s < maxSV1; s++) {
+				if (normalizedCounter[s] == -1) {
+					tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+					symbolNext[s] = 1;
+				} else {
+					/* any count reaching half the table disables fast mode */
+					if (normalizedCounter[s] >= largeLimit)
+						DTableH.fastMode = 0;
+					symbolNext[s] = normalizedCounter[s];
+				}
+			}
+		}
+		memcpy(dt, &DTableH, sizeof(DTableH));
+	}
+
+	/* Spread symbols */
+	{
+		U32 const tableMask = tableSize - 1;
+		U32 const step = FSE_TABLESTEP(tableSize);
+		U32 s, position = 0;
+		for (s = 0; s < maxSV1; s++) {
+			int i;
+			for (i = 0; i < normalizedCounter[s]; i++) {
+				tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+				position = (position + step) & tableMask;
+				while (position > highThreshold)
+					position = (position + step) & tableMask; /* lowprob area */
+			}
+		}
+		if (position != 0)
+			return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+	}
+
+	/* Build Decoding table */
+	{
+		U32 u;
+		for (u = 0; u < tableSize; u++) {
+			FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+			U16 nextState = symbolNext[symbol]++;
+			tableDecode[u].nbBits = (BYTE)(tableLog - BIT_highbit32((U32)nextState));
+			tableDecode[u].newState = (U16)((nextState << tableDecode[u].nbBits) - tableSize);
+		}
+	}
+
+	return 0;
+}
+
+/*-*******************************************************
+* Decompression (Byte symbols)
+*********************************************************/
+/*
+ * FSE_buildDTable_rle() :
+ * Builds a one-cell decoding table that always yields `symbolValue`
+ * and consumes no bits (tableLog 0, nbBits 0).
+ * @return : always 0.
+ */
+size_t FSE_buildDTable_rle(FSE_DTable *dt, BYTE symbolValue)
+{
+	void *ptr = dt;
+	FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr;
+	void *dPtr = dt + 1;
+	FSE_decode_t *const cell = (FSE_decode_t *)dPtr;
+
+	DTableH->tableLog = 0;
+	DTableH->fastMode = 0;
+
+	cell->newState = 0;
+	cell->symbol = symbolValue;
+	cell->nbBits = 0;
+
+	return 0;
+}
+
+/*
+ * FSE_buildDTable_raw() :
+ * Builds a table of (1 << nbBits) cells in which cell `s` decodes to symbol `s`
+ * and always reads `nbBits` bits.
+ * @return : 0 on success, or an error code when nbBits < 1.
+ */
+size_t FSE_buildDTable_raw(FSE_DTable *dt, unsigned nbBits)
+{
+	void *ptr = dt;
+	FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr;
+	void *dPtr = dt + 1;
+	FSE_decode_t *const dinfo = (FSE_decode_t *)dPtr;
+	const unsigned tableSize = 1 << nbBits;
+	const unsigned tableMask = tableSize - 1;
+	const unsigned maxSV1 = tableMask + 1;
+	unsigned s;
+
+	/* Sanity checks */
+	if (nbBits < 1)
+		return ERROR(GENERIC); /* min size */
+
+	/* Build Decoding Table */
+	DTableH->tableLog = (U16)nbBits;
+	DTableH->fastMode = 1;
+	for (s = 0; s < maxSV1; s++) {
+		dinfo[s].newState = 0;
+		dinfo[s].symbol = (BYTE)s;
+		dinfo[s].nbBits = (BYTE)nbBits;
+	}
+
+	return 0;
+}
+
+/*
+ * FSE_decompress_usingDTable_generic() :
+ * Decodes the bitstream `cSrc` into `dst` with the table `dt`, alternating
+ * between two decoder states (state1, state2).
+ * `fast` selects FSE_decodeSymbolFast() over FSE_decodeSymbol(); the only
+ * caller in this file passes a literal 0 or 1.
+ * @return : number of bytes written into `dst`, or an error code
+ *           (bitstream init failure, or dstSize_tooSmall).
+ */
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt,
+						       const unsigned fast)
+{
+	BYTE *const ostart = (BYTE *)dst;
+	BYTE *op = ostart;
+	BYTE *const omax = op + maxDstSize;
+	BYTE *const olimit = omax - 3; /* the main loop writes 4 bytes per iteration */
+
+	BIT_DStream_t bitD;
+	FSE_DState_t state1;
+	FSE_DState_t state2;
+
+	/* Init */
+	CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+	FSE_initDState(&state1, &bitD, dt);
+	FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+	/* 4 symbols per loop */
+	for (; (BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) & (op < olimit); op += 4) {
+		op[0] = FSE_GETSYMBOL(&state1);
+
+		if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+			BIT_reloadDStream(&bitD);
+
+		op[1] = FSE_GETSYMBOL(&state2);
+
+		if (FSE_MAX_TABLELOG * 4 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+		{
+			if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) {
+				op += 2; /* only op[0] and op[1] were produced in this iteration */
+				break;
+			}
+		}
+
+		op[2] = FSE_GETSYMBOL(&state1);
+
+		if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+			BIT_reloadDStream(&bitD);
+
+		op[3] = FSE_GETSYMBOL(&state2);
+	}
+
+	/* tail */
+	/* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+	while (1) {
+		/* each round may write two bytes : one here, one more on overflow */
+		if (op > (omax - 2))
+			return ERROR(dstSize_tooSmall);
+		*op++ = FSE_GETSYMBOL(&state1);
+		if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
+			*op++ = FSE_GETSYMBOL(&state2);
+			break;
+		}
+
+		if (op > (omax - 2))
+			return ERROR(dstSize_tooSmall);
+		*op++ = FSE_GETSYMBOL(&state2);
+		if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
+			*op++ = FSE_GETSYMBOL(&state1);
+			break;
+		}
+	}
+
+	return op - ostart;
+}
+
+/*
+ * FSE_decompress_usingDTable() :
+ * Reads the `fastMode` flag from the header stored at `dt` and calls the
+ * generic decoder with the matching constant.
+ * @return : whatever FSE_decompress_usingDTable_generic() returns.
+ */
+size_t FSE_decompress_usingDTable(void *dst, size_t originalSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt)
+{
+	const void *ptr = dt;
+	const FSE_DTableHeader *DTableH = (const FSE_DTableHeader *)ptr;
+	const U32 fastMode = DTableH->fastMode;
+
+	/* select fast mode (static) */
+	if (fastMode)
+		return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+	return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+/*
+ * FSE_decompress_wksp() :
+ * Reads the normalized-count header at the start of `cSrc`, builds the
+ * decoding table and decodes the remaining input into `dst`.
+ * `workspace` is carved, in U32 units, into : the DTable
+ * (FSE_DTABLE_SIZE_U32(maxLog) entries), then the `counting` array
+ * (FSE_MAX_SYMBOL_VALUE + 1 shorts, rounded up to a multiple of 4 bytes);
+ * what is left is passed on to FSE_buildDTable_wksp().
+ * @return : the result of FSE_decompress_usingDTable(), or an error code when
+ *           the workspace is too small, the header is invalid, or its tableLog
+ *           is larger than `maxLog`.
+ */
+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize)
+{
+	const BYTE *const istart = (const BYTE *)cSrc;
+	const BYTE *ip = istart;
+	unsigned tableLog;
+	unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+	size_t NCountLength;
+
+	FSE_DTable *dt;
+	short *counting;
+	size_t spaceUsed32 = 0; /* workspace consumed so far, counted in U32 */
+
+	FSE_STATIC_ASSERT(sizeof(FSE_DTable) == sizeof(U32));
+
+	dt = (FSE_DTable *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += FSE_DTABLE_SIZE_U32(maxLog);
+	counting = (short *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(short) * (FSE_MAX_SYMBOL_VALUE + 1), sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	/* normal FSE decoding mode */
+	NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+	if (FSE_isError(NCountLength))
+		return NCountLength;
+	// if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); /* too small input size; supposed to be already checked in NCountLength, only remaining
+	// case : NCountLength==cSrcSize */
+	if (tableLog > maxLog)
+		return ERROR(tableLog_tooLarge);
+	ip += NCountLength;
+	cSrcSize -= NCountLength;
+
+	CHECK_F(FSE_buildDTable_wksp(dt, counting, maxSymbolValue, tableLog, workspace, workspaceSize));
+
+	return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, dt); /* always return, even if it is an error code */
+}
diff --git a/lib/zstd/huf.h b/lib/zstd/huf.h
new file mode 100644
index 000000000000..2143da28d952
--- /dev/null
+++ b/lib/zstd/huf.h
@@ -0,0 +1,212 @@
+/*
+ * Huffman coder, part of New Generation Entropy library
+ * header file
+ * Copyright (C) 2013-2016, Yann Collet.
+ * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <linux/types.h> /* size_t */
+
+/* *** Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */
+
+/* Error Management */
+unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */
+
+/* *** Advanced function *** */
+
+/** HUF_compress4X_wksp() :
+* Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+/* *** Dependencies *** */
+#include "mem.h" /* U32 */
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX 12 /* max configured tableLog (for static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
+#define HUF_TABLELOG_DEFAULT 11 /* tableLog by default, when not specified */
+#define HUF_SYMBOLVALUE_MAX 255
+
+#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_TABLELOG_MAX. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+/* ****************************************
+* Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) ((size) + ((size) >> 8) + 8) /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+	U32 name##hb[(maxSymbolValue) + 1];            \
+	void *name##hv = &(name##hb);                  \
+	HUF_CElt *name = (HUF_CElt *)(name##hv) /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1 << (maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = {((U32)((maxTableLog)-1) * 0x01000001)}
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = {((U32)(maxTableLog)*0x01000001)}
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_COMPRESS_WORKSPACE_SIZE (6 << 10)
+#define HUF_COMPRESS_WORKSPACE_SIZE_U32 (HUF_COMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (3 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+/* ****************************************
+* Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize); /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				     size_t workspaceSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< double-symbols decoder */
+
+/* ****************************************
+* HUF detailed API
+******************************************/
+/*!
+HUF_compress() does the following:
+1. count symbol occurrence from source[] into table count[] using FSE_count()
+2. (optional) refine tableLog using HUF_optimalTableLog()
+3. build Huffman table from count using HUF_buildCTable()
+4. save Huffman table to memory buffer using HUF_writeCTable_wksp()
+5. encode the data stream using HUF_compress4X_usingCTable()
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and regenerate 'CTable' using external methods.
+*/
+/* FSE_count() : find it within "fse.h" */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
+size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog, void *workspace, size_t workspaceSize);
+size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+
+typedef enum {
+	HUF_repeat_none,  /**< Cannot use the previous table */
+	HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1,
+			     4}X_repeat */
+	HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+} HUF_repeat;
+/** HUF_compress4X_repeat() :
+* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+* If it uses hufTable it does not modify hufTable or repeat.
+* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+* If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ */
+size_t HUF_buildCTable_wksp(HUF_CElt *tree, const U32 *count, U32 maxSymbolValue, U32 maxNbBits, void *workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+	Read compact Huffman tree, saved by HUF_writeCTable().
+	`huffWeight` is destination buffer.
+	@return : size read from `src` , or an error Code .
+	Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize,
+			  void *workspace, size_t workspaceSize);
+
+/** HUF_readCTable() :
+* Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable_wksp(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+/*
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+2. build Huffman table from save, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
+*/
+
+/** HUF_selectDecoder() :
+* Tells which decoder is likely to decode faster,
+* based on a set of pre-determined metrics.
+* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+* Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
+
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+/* single stream variants */
+
+size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+/** HUF_compress1X_repeat() :
+* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+* If it uses hufTable it does not modify hufTable or repeat.
+* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+* If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize,
+				    const HUF_DTable *DTable); /**< automatic selection of single or double symbol decoder, based on DTable */
+size_t HUF_decompress1X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress1X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+#endif /* HUF_H_298734234 */
diff --git a/lib/zstd/huf_compress.c b/lib/zstd/huf_compress.c
new file mode 100644
index 000000000000..40055a7016e6
--- /dev/null
+++ b/lib/zstd/huf_compress.c
@@ -0,0 +1,770 @@
+/*
+ * Huffman encoder, part of New Generation Entropy library
+ * Copyright (C) 2013-2016, Yann Collet.
+ * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h" /* header compression */
+#include "huf.h"
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c)                                   \
+	{                                                      \
+		enum { HUF_static_assert = 1 / (int)(!!(c)) }; \
+	} /* use only *after* variable declarations */
+/*
+ * CHECK_V_F(e, f) : evaluates `f` once, stores the result in a new
+ * `size_t const e` declared in the enclosing scope, and returns `e` from the
+ * enclosing function when it is an error code.
+ * CHECK_F(f) : same, with the result kept in a private scope.
+ */
+#define CHECK_V_F(e, f)     \
+	size_t const e = f; \
+	if (ERR_isError(e)) \
+	return e
+#define CHECK_F(f)                        \
+	{                                 \
+		CHECK_V_F(_var_err__, f); \
+	}
+
+/* **************************************************************
+* Utils
+****************************************************************/
+/* HUF_optimalTableLog() :
+ * Forwards to FSE_optimalTableLog_internal() with its last argument set to 1. */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+	return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+/* *******************************************************
+* HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */ +#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6 +size_t HUF_compressWeights_wksp(void *dst, size_t dstSize, const void *weightTable, size_t wtSize, void *workspace, size_t workspaceSize) +{ + BYTE *const ostart = (BYTE *)dst; + BYTE *op = ostart; + BYTE *const oend = ostart + dstSize; + + U32 maxSymbolValue = HUF_TABLELOG_MAX; + U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; + + FSE_CTable *CTable; + U32 *count; + S16 *norm; + size_t spaceUsed32 = 0; + + HUF_STATIC_ASSERT(sizeof(FSE_CTable) == sizeof(U32)); + + CTable = (FSE_CTable *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX); + count = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 1; + norm = (S16 *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(S16) * (HUF_TABLELOG_MAX + 1), sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* init conditions */ + if (wtSize <= 1) + return 0; /* Not compressible */ + + /* Scan input and build symbol stats */ + { + CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize)); + if (maxCount == wtSize) + return 1; /* only a single symbol in src : rle */ + if (maxCount == 1) + return 0; /* each symbol present maximum once => not compressible */ + } + + tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue); + CHECK_F(FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue)); + + /* Write table description header */ + { + CHECK_V_F(hSize, FSE_writeNCount(op, oend - op, norm, maxSymbolValue, tableLog)); + op += hSize; + } + + /* Compress */ + CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, workspace, workspaceSize)); + { + CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable)); + if (cSize == 0) + return 0; /* not enough space for 
compressed data */ + op += cSize; + } + + return op - ostart; +} + +struct HUF_CElt_s { + U16 val; + BYTE nbBits; +}; /* typedef'd to HUF_CElt within "huf.h" */ + +/*! HUF_writeCTable_wksp() : + `CTable` : Huffman tree to save, using huf representation. + @return : size of saved CTable */ +size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, U32 maxSymbolValue, U32 huffLog, void *workspace, size_t workspaceSize) +{ + BYTE *op = (BYTE *)dst; + U32 n; + + BYTE *bitsToWeight; + BYTE *huffWeight; + size_t spaceUsed32 = 0; + + bitsToWeight = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_TABLELOG_MAX + 1, sizeof(U32)) >> 2; + huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* check conditions */ + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) + return ERROR(maxSymbolValue_tooLarge); + + /* convert to weight */ + bitsToWeight[0] = 0; + for (n = 1; n < huffLog + 1; n++) + bitsToWeight[n] = (BYTE)(huffLog + 1 - n); + for (n = 0; n < maxSymbolValue; n++) + huffWeight[n] = bitsToWeight[CTable[n].nbBits]; + + /* attempt weights compression by FSE */ + { + CHECK_V_F(hSize, HUF_compressWeights_wksp(op + 1, maxDstSize - 1, huffWeight, maxSymbolValue, workspace, workspaceSize)); + if ((hSize > 1) & (hSize < maxSymbolValue / 2)) { /* FSE compressed */ + op[0] = (BYTE)hSize; + return hSize + 1; + } + } + + /* write raw values as 4-bits (max : 15) */ + if (maxSymbolValue > (256 - 128)) + return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */ + if (((maxSymbolValue + 1) / 2) + 1 > maxDstSize) + return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */ + op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue - 1)); + huffWeight[maxSymbolValue] = 0; /* to be 
sure it doesn't cause msan issue in final combination */ + for (n = 0; n < maxSymbolValue; n += 2) + op[(n / 2) + 1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n + 1]); + return ((maxSymbolValue + 1) / 2) + 1; +} + +size_t HUF_readCTable_wksp(HUF_CElt *CTable, U32 maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize) +{ + U32 *rankVal; + BYTE *huffWeight; + U32 tableLog = 0; + U32 nbSymbols = 0; + size_t readSize; + size_t spaceUsed32 = 0; + + rankVal = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; + huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* get symbol weights */ + readSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize); + if (ERR_isError(readSize)) + return readSize; + + /* check result */ + if (tableLog > HUF_TABLELOG_MAX) + return ERROR(tableLog_tooLarge); + if (nbSymbols > maxSymbolValue + 1) + return ERROR(maxSymbolValue_tooSmall); + + /* Prepare base value per rank */ + { + U32 n, nextRankStart = 0; + for (n = 1; n <= tableLog; n++) { + U32 curr = nextRankStart; + nextRankStart += (rankVal[n] << (n - 1)); + rankVal[n] = curr; + } + } + + /* fill nbBits */ + { + U32 n; + for (n = 0; n < nbSymbols; n++) { + const U32 w = huffWeight[n]; + CTable[n].nbBits = (BYTE)(tableLog + 1 - w); + } + } + + /* fill val */ + { + U16 nbPerRank[HUF_TABLELOG_MAX + 2] = {0}; /* support w=0=>n=tableLog+1 */ + U16 valPerRank[HUF_TABLELOG_MAX + 2] = {0}; + { + U32 n; + for (n = 0; n < nbSymbols; n++) + nbPerRank[CTable[n].nbBits]++; + } + /* determine stating value per rank */ + valPerRank[tableLog + 1] = 0; /* for w==0 */ + { + U16 min = 0; + U32 n; + for (n = tableLog; n > 0; n--) { /* 
start at n=tablelog <-> w=1 */ + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } + } + /* assign value within rank, symbol order */ + { + U32 n; + for (n = 0; n <= maxSymbolValue; n++) + CTable[n].val = valPerRank[CTable[n].nbBits]++; + } + } + + return readSize; +} + +typedef struct nodeElt_s { + U32 count; + U16 parent; + BYTE byte; + BYTE nbBits; +} nodeElt; + +static U32 HUF_setMaxHeight(nodeElt *huffNode, U32 lastNonNull, U32 maxNbBits) +{ + const U32 largestBits = huffNode[lastNonNull].nbBits; + if (largestBits <= maxNbBits) + return largestBits; /* early exit : no elt > maxNbBits */ + + /* there are several too large elements (at least >= 2) */ + { + int totalCost = 0; + const U32 baseCost = 1 << (largestBits - maxNbBits); + U32 n = lastNonNull; + + while (huffNode[n].nbBits > maxNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); + huffNode[n].nbBits = (BYTE)maxNbBits; + n--; + } /* n stops at huffNode[n].nbBits <= maxNbBits */ + while (huffNode[n].nbBits == maxNbBits) + n--; /* n end at index of smallest symbol using < maxNbBits */ + + /* renorm totalCost */ + totalCost >>= (largestBits - maxNbBits); /* note : totalCost is necessarily a multiple of baseCost */ + + /* repay normalized cost */ + { + U32 const noSymbol = 0xF0F0F0F0; + U32 rankLast[HUF_TABLELOG_MAX + 2]; + int pos; + + /* Get pos of last (smallest) symbol per rank */ + memset(rankLast, 0xF0, sizeof(rankLast)); + { + U32 currNbBits = maxNbBits; + for (pos = n; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currNbBits) + continue; + currNbBits = huffNode[pos].nbBits; /* < maxNbBits */ + rankLast[maxNbBits - currNbBits] = pos; + } + } + + while (totalCost > 0) { + U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1; + for (; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 highPos = rankLast[nBitsToDecrease]; + U32 lowPos = rankLast[nBitsToDecrease - 1]; + if (highPos == noSymbol) + continue; + if (lowPos == 
noSymbol) + break; + { + U32 const highTotal = huffNode[highPos].count; + U32 const lowTotal = 2 * huffNode[lowPos].count; + if (highTotal <= lowTotal) + break; + } + } + /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */ + /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + while ((nBitsToDecrease <= HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) + nBitsToDecrease++; + totalCost -= 1 << (nBitsToDecrease - 1); + if (rankLast[nBitsToDecrease - 1] == noSymbol) + rankLast[nBitsToDecrease - 1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */ + huffNode[rankLast[nBitsToDecrease]].nbBits++; + if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */ + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; + if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits - nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ + + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ + if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 + (using maxNbBits) */ + while (huffNode[n].nbBits == maxNbBits) + n--; + huffNode[n + 1].nbBits--; + rankLast[1] = n + 1; + totalCost++; + continue; + } + huffNode[rankLast[1] + 1].nbBits--; + rankLast[1]++; + totalCost++; + } + } + } /* there are several too large elements (at least >= 2) */ + + return maxNbBits; +} + +typedef struct { + U32 base; + U32 curr; +} rankPos; + +static void HUF_sort(nodeElt *huffNode, const U32 *count, U32 maxSymbolValue) +{ + rankPos rank[32]; + U32 n; + + memset(rank, 0, sizeof(rank)); + for (n = 0; n <= maxSymbolValue; n++) { + U32 r = BIT_highbit32(count[n] + 1); + rank[r].base++; + } + for (n = 30; n > 0; n--) + rank[n - 1].base += rank[n].base; + for (n = 0; n < 32; n++) + rank[n].curr = 
rank[n].base; + for (n = 0; n <= maxSymbolValue; n++) { + U32 const c = count[n]; + U32 const r = BIT_highbit32(c + 1) + 1; + U32 pos = rank[r].curr++; + while ((pos > rank[r].base) && (c > huffNode[pos - 1].count)) + huffNode[pos] = huffNode[pos - 1], pos--; + huffNode[pos].count = c; + huffNode[pos].byte = (BYTE)n; + } +} + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned. + */ +#define STARTNODE (HUF_SYMBOLVALUE_MAX + 1) +typedef nodeElt huffNodeTable[2 * HUF_SYMBOLVALUE_MAX + 1 + 1]; +size_t HUF_buildCTable_wksp(HUF_CElt *tree, const U32 *count, U32 maxSymbolValue, U32 maxNbBits, void *workSpace, size_t wkspSize) +{ + nodeElt *const huffNode0 = (nodeElt *)workSpace; + nodeElt *const huffNode = huffNode0 + 1; + U32 n, nonNullRank; + int lowS, lowN; + U16 nodeNb = STARTNODE; + U32 nodeRoot; + + /* safety checks */ + if (wkspSize < sizeof(huffNodeTable)) + return ERROR(GENERIC); /* workSpace is not large enough */ + if (maxNbBits == 0) + maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) + return ERROR(GENERIC); + memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue); + + /* init for parents */ + nonNullRank = maxSymbolValue; + while (huffNode[nonNullRank].count == 0) + nonNullRank--; + lowS = nonNullRank; + nodeRoot = nodeNb + lowS - 1; + lowN = nodeNb; + huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS - 1].count; + huffNode[lowS].parent = huffNode[lowS - 1].parent = nodeNb; + nodeNb++; + lowS -= 2; + for (n = nodeNb; n <= nodeRoot; n++) + huffNode[n].count = (U32)(1U << 30); + huffNode0[0].count = (U32)(1U << 31); /* fake entry, strong barrier */ + + /* create parents */ + while (nodeNb <= nodeRoot) { + U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? 
lowS-- : lowN++; + U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count; + huffNode[n1].parent = huffNode[n2].parent = nodeNb; + nodeNb++; + } + + /* distribute weights (unlimited tree height) */ + huffNode[nodeRoot].nbBits = 0; + for (n = nodeRoot - 1; n >= STARTNODE; n--) + huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1; + for (n = 0; n <= nonNullRank; n++) + huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1; + + /* enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits); + + /* fill result into tree (val, nbBits) */ + { + U16 nbPerRank[HUF_TABLELOG_MAX + 1] = {0}; + U16 valPerRank[HUF_TABLELOG_MAX + 1] = {0}; + if (maxNbBits > HUF_TABLELOG_MAX) + return ERROR(GENERIC); /* check fit into table */ + for (n = 0; n <= nonNullRank; n++) + nbPerRank[huffNode[n].nbBits]++; + /* determine starting value per rank */ + { + U16 min = 0; + for (n = maxNbBits; n > 0; n--) { + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } + } + for (n = 0; n <= maxSymbolValue; n++) + tree[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */ + for (n = 0; n <= maxSymbolValue; n++) + tree[n].val = valPerRank[tree[n].nbBits]++; /* assign value within rank, symbol order */ + } + + return maxNbBits; +} + +/** HUF_estimateCompressedSize() : + * Sums `CTable[s].nbBits * count[s]` over every symbol s in [0, maxSymbolValue]. + * @return : that bit total divided by 8 (rounded down); only the symbol payload is counted. + */ +static size_t HUF_estimateCompressedSize(HUF_CElt *CTable, const unsigned *count, unsigned maxSymbolValue) +{ + size_t nbBits = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + nbBits += CTable[s].nbBits * count[s]; + } + return nbBits >> 3; +} + +/** HUF_validateCTable() : + * @return : 1 if every symbol in [0, maxSymbolValue] with a non-zero `count` has a non-zero `nbBits` in `CTable`, 0 otherwise. + */ +static int HUF_validateCTable(const HUF_CElt *CTable, const unsigned *count, unsigned maxSymbolValue) +{ + int bad = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (CTable[s].nbBits == 0); /* symbol is present but has no code */ + } + return !bad; +} + +static void HUF_encodeSymbol(BIT_CStream_t *bitCPtr, U32 
symbol, const HUF_CElt *CTable) +{ + BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); +} + +size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } + +#define HUF_FLUSHBITS(s) BIT_flushBits(s) + +#define HUF_FLUSHBITS_1(stream) \ + if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 2 + 7) \ + HUF_FLUSHBITS(stream) + +#define HUF_FLUSHBITS_2(stream) \ + if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 4 + 7) \ + HUF_FLUSHBITS(stream) + +/** HUF_compress1X_usingCTable() : + * Encodes `src` as a single bit-stream using `CTable`; symbols are emitted from the last byte of `src` down to the first. + * @return : 0 if `dstSize` < 8 or the bit-stream cannot be initialised, otherwise the value returned by BIT_closeCStream(). + */ +size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable) +{ + const BYTE *ip = (const BYTE *)src; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + BYTE *op = ostart; + size_t n; + BIT_CStream_t bitC; + + /* init */ + if (dstSize < 8) + return 0; /* not enough space to compress */ + { + size_t const initErr = BIT_initCStream(&bitC, op, oend - op); + if (HUF_isError(initErr)) + return 0; + } + + n = srcSize & ~3; /* round srcSize down to a multiple of 4 */ + switch (srcSize & 3) { /* encode the 0-3 trailing bytes first; every case deliberately falls into the next one */ + case 3: HUF_encodeSymbol(&bitC, ip[n + 2], CTable); HUF_FLUSHBITS_2(&bitC); /* fall through */ + case 2: HUF_encodeSymbol(&bitC, ip[n + 1], CTable); HUF_FLUSHBITS_1(&bitC); /* fall through */ + case 1: HUF_encodeSymbol(&bitC, ip[n + 0], CTable); HUF_FLUSHBITS(&bitC); /* fall through */ + case 0: + default:; + } + + for (; n > 0; n -= 4) { /* note : n&3==0 at this stage */ + HUF_encodeSymbol(&bitC, ip[n - 1], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n - 2], CTable); + HUF_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n - 3], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n - 4], CTable); + HUF_FLUSHBITS(&bitC); + } + + return BIT_closeCStream(&bitC); +} + +size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable) +{ + size_t const segmentSize = (srcSize + 3) / 4; /* first 3 segments */ + const BYTE *ip = (const BYTE *)src; + const BYTE *const iend = ip + srcSize; + BYTE *const ostart = (BYTE 
*)dst; + BYTE *const oend = ostart + dstSize; + BYTE *op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) + return 0; /* minimum space to compress successfully */ + if (srcSize < 12) + return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable)); + if (cSize == 0) + return 0; + ZSTD_writeLE16(ostart, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable)); + if (cSize == 0) + return 0; + ZSTD_writeLE16(ostart + 2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable)); + if (cSize == 0) + return 0; + ZSTD_writeLE16(ostart + 4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, iend - ip, CTable)); + if (cSize == 0) + return 0; + op += cSize; + } + + return op - ostart; +} + +static size_t HUF_compressCTable_internal(BYTE *const ostart, BYTE *op, BYTE *const oend, const void *src, size_t srcSize, unsigned singleStream, + const HUF_CElt *CTable) +{ + size_t const cSize = + singleStream ? 
HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable) : HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable); + if (HUF_isError(cSize)) { + return cSize; + } + if (cSize == 0) { + return 0; + } /* uncompressible */ + op += cSize; + /* check compressibility : total output (everything written since ostart) must be smaller than srcSize - 1 */ + if ((size_t)(op - ostart) >= srcSize - 1) { + return 0; + } + return op - ostart; +} + +/* `workSpace` must be a table of at least 1024 unsigned; + * the size check below requires `wkspSize` >= sizeof(huffNodeTable) + countSize + CTableSize bytes, + * because the front of `workSpace` is carved into the `count` and `CTable` arrays. */ +static size_t HUF_compress_internal(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, + unsigned singleStream, void *workSpace, size_t wkspSize, HUF_CElt *oldHufTable, HUF_repeat *repeat, int preferRepeat) +{ + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + BYTE *op = ostart; + + U32 *count; + size_t const countSize = sizeof(U32) * (HUF_SYMBOLVALUE_MAX + 1); + HUF_CElt *CTable; + size_t const CTableSize = sizeof(HUF_CElt) * (HUF_SYMBOLVALUE_MAX + 1); + + /* checks & inits */ + if (wkspSize < sizeof(huffNodeTable) + countSize + CTableSize) + return ERROR(GENERIC); + if (!srcSize) + return 0; /* Uncompressed (note : 1 means rle, so first byte must be correct) */ + if (!dstSize) + return 0; /* cannot fit within dst budget */ + if (srcSize > HUF_BLOCKSIZE_MAX) + return ERROR(srcSize_wrong); /* curr block size limit */ + if (huffLog > HUF_TABLELOG_MAX) + return ERROR(tableLog_tooLarge); + if (!maxSymbolValue) + maxSymbolValue = HUF_SYMBOLVALUE_MAX; /* 0 selects the default */ + if (!huffLog) + huffLog = HUF_TABLELOG_DEFAULT; /* 0 selects the default */ + + /* carve `count` then `CTable` out of the front of the workspace; the remainder is passed on to the helpers */ + count = (U32 *)workSpace; + workSpace = (BYTE *)workSpace + countSize; + wkspSize -= countSize; + CTable = (HUF_CElt *)workSpace; + workSpace = (BYTE *)workSpace + CTableSize; + wkspSize -= CTableSize; + + /* Heuristic : if the old table is already known to be valid and the caller prefers reuse, skip the statistics pass and use it directly */ + if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + 
} + + /* Scan input and build symbol stats */ + { + CHECK_V_F(largest, FSE_count_wksp(count, &maxSymbolValue, (const BYTE *)src, srcSize, (U32 *)workSpace)); + if (largest == srcSize) { + *ostart = ((const BYTE *)src)[0]; + return 1; + } /* single symbol, rle */ + if (largest <= (srcSize >> 7) + 1) + return 0; /* Fast heuristic : not compressible enough */ + } + + /* Check validity of previous table */ + if (repeat && *repeat == HUF_repeat_check && !HUF_validateCTable(oldHufTable, count, maxSymbolValue)) { + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ + if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + } + + /* Build Huffman Tree */ + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { + CHECK_V_F(maxBits, HUF_buildCTable_wksp(CTable, count, maxSymbolValue, huffLog, workSpace, wkspSize)); + huffLog = (U32)maxBits; + /* Zero the unused symbols so we can check it for validity */ + memset(CTable + maxSymbolValue + 1, 0, CTableSize - (maxSymbolValue + 1) * sizeof(HUF_CElt)); + } + + /* Write table description header */ + { + CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, CTable, maxSymbolValue, huffLog, workSpace, wkspSize)); + /* Check if using the previous table will be beneficial */ + if (repeat && *repeat != HUF_repeat_none) { + size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue); + size_t const newSize = HUF_estimateCompressedSize(CTable, count, maxSymbolValue); + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + } + } + /* Use the new table */ + if (hSize + 12ul >= srcSize) { + return 0; + } + op += hSize; + if (repeat) { + *repeat = HUF_repeat_none; + } + if (oldHufTable) { + memcpy(oldHufTable, CTable, CTableSize); + } /* Save the new table */ + } + 
return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable); +} + +size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL, 0); +} + +size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat, int preferRepeat) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat, + preferRepeat); +} + +size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL, 0); +} + +size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat, int preferRepeat) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat, + preferRepeat); +} diff --git a/lib/zstd/huf_decompress.c b/lib/zstd/huf_decompress.c new file mode 100644 index 000000000000..6526482047dc --- /dev/null +++ b/lib/zstd/huf_decompress.c @@ -0,0 +1,960 @@ +/* + * Huffman decoder, part of New Generation Entropy library + * Copyright (C) 2013-2016, Yann Collet. 
+ * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + */ + +/* ************************************************************** +* Compiler specifics +****************************************************************/ +#define FORCE_INLINE static __always_inline + +/* ************************************************************** +* Dependencies +****************************************************************/ +#include "bitstream.h" /* BIT_* */ +#include "fse.h" /* header compression */ +#include "huf.h" +#include +#include +#include /* memcpy, memset */ + +/* ************************************************************** +* Error Management +****************************************************************/ +#define HUF_STATIC_ASSERT(c) \ + { \ + enum { HUF_static_assert = 1 / (int)(!!(c)) }; \ + } /* use only *after* variable declarations */ + +/*-***************************/ +/* generic DTableDesc */ +/*-***************************/ + +typedef struct { + BYTE maxTableLog; + BYTE tableType; + BYTE tableLog; + BYTE reserved; +} DTableDesc; + +static DTableDesc HUF_getDTableDesc(const HUF_DTable *table) +{ + DTableDesc dtd; + memcpy(&dtd, table, sizeof(dtd)); + return dtd; +} + +/*-***************************/ +/* single-symbol decoding */ +/*-***************************/ + +typedef struct { + BYTE byte; + BYTE nbBits; +} HUF_DEltX2; /* single-symbol decoding */ + +size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize) +{ + U32 tableLog = 0; + U32 nbSymbols = 0; + size_t iSize; + void *const dtPtr = DTable + 1; + HUF_DEltX2 *const dt = (HUF_DEltX2 *)dtPtr; + + U32 *rankVal; + BYTE *huffWeight; + size_t spaceUsed32 = 0; + + rankVal = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; + huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if 
((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + + iSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize); + if (HUF_isError(iSize)) + return iSize; + + /* Table header */ + { + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (tableLog > (U32)(dtd.maxTableLog + 1)) + return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ + dtd.tableType = 0; + dtd.tableLog = (BYTE)tableLog; + memcpy(DTable, &dtd, sizeof(dtd)); + } + + /* Calculate starting value for each rank */ + { + U32 n, nextRankStart = 0; + for (n = 1; n < tableLog + 1; n++) { + U32 const curr = nextRankStart; + nextRankStart += (rankVal[n] << (n - 1)); + rankVal[n] = curr; + } + } + + /* fill DTable */ + { + U32 n; + for (n = 0; n < nbSymbols; n++) { + U32 const w = huffWeight[n]; + U32 const length = (1 << w) >> 1; + U32 u; + HUF_DEltX2 D; + D.byte = (BYTE)n; + D.nbBits = (BYTE)(tableLog + 1 - w); + for (u = rankVal[w]; u < rankVal[w] + length; u++) + dt[u] = D; + rankVal[w] += length; + } + } + + return iSize; +} + +static BYTE HUF_decodeSymbolX2(BIT_DStream_t *Dstream, const HUF_DEltX2 *dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ + BYTE const c = dt[val].byte; + BIT_skipBits(Dstream, dt[val].nbBits); + return c; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \ + HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + if (ZSTD_64bits()) \ + 
HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + +/** HUF_decodeStreamX2() : + * Decodes single-symbol table entries from `bitDPtr` into [p, pEnd), one output byte per decoded symbol. + * The stream is reloaded while BIT_reloadDStream() reports BIT_DStream_unfinished; the remaining bytes are decoded without reloading. + * @return : pEnd - p (the number of bytes written). + */ +FORCE_INLINE size_t HUF_decodeStreamX2(BYTE *p, BIT_DStream_t *const bitDPtr, BYTE *const pEnd, const HUF_DEltX2 *const dt, const U32 dtLog) +{ + BYTE *const pStart = p; + + /* up to 4 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd - 4)) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + + /* closer to the end */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + /* no more data to retrieve from bitstream, hence no need to reload */ + while (p < pEnd) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + return pEnd - pStart; +} + +/** HUF_decompress1X2_usingDTable_internal() : + * Decodes one bit-stream (`cSrc`, `cSrcSize`) into exactly `dstSize` bytes of `dst`, using the table log stored in the descriptor of `DTable`. + * @return : `dstSize` on success, the error code of BIT_initDStream(), + * or ERROR(corruption_detected) when BIT_endOfDStream() is false after decoding. + */ +static size_t HUF_decompress1X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + BYTE *op = (BYTE *)dst; + BYTE *const oend = op + dstSize; + const void *dtPtr = DTable + 1; /* decoding cells start right after the descriptor cell */ + const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr; + BIT_DStream_t bitD; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + { + size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize); + if (HUF_isError(errorCode)) + return errorCode; + } + + HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog); + + /* check */ + if (!BIT_endOfDStream(&bitD)) + return ERROR(corruption_detected); + + return dstSize; +} + +/** HUF_decompress1X2_usingDTable() : + * Checked entry point : returns ERROR(GENERIC) unless the descriptor of `DTable` has tableType == 0, + * then defers to HUF_decompress1X2_usingDTable_internal(). + */ +size_t HUF_decompress1X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) + return ERROR(GENERIC); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE 
*)cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx); +} + +static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + /* Check */ + if (cSrcSize < 10) + return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { + const BYTE *const istart = (const BYTE *)cSrc; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + const void *const dtPtr = DTable + 1; + const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = ZSTD_readLE16(istart); + size_t const length2 = ZSTD_readLE16(istart + 2); + size_t const length3 = ZSTD_readLE16(istart + 4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE *const istart1 = istart + 6; /* jumpTable */ + const BYTE *const istart2 = istart1 + length1; + const BYTE *const istart3 = istart2 + length2; + const BYTE *const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize + 3) / 4; + BYTE *const opStart2 = ostart + segmentSize; + BYTE *const opStart3 = opStart2 + segmentSize; + BYTE *const opStart4 = opStart3 + segmentSize; + BYTE *op1 = ostart; + BYTE *op2 = opStart2; + BYTE *op3 = opStart3; + BYTE *op4 = opStart4; + U32 endSignal; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) + return ERROR(corruption_detected); /* overflow */ + { + size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = 
BIT_initDStream(&bitD2, istart2, length2); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4); + if (HUF_isError(errorCode)) + return errorCode; + } + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for (; (endSignal == BIT_DStream_unfinished) && (op4 < (oend - 7));) { + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) + return ERROR(corruption_detected); + if (op2 > opStart3) + return ERROR(corruption_detected); + if (op3 > opStart4) + return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & 
BIT_endOfDStream(&bitD4); + if (!endSignal) + return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; + } +} + +size_t HUF_decompress4X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) + return ERROR(GENERIC); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE *)cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx); +} + +/* *************************/ +/* double-symbols decoding */ +/* *************************/ +typedef struct { + U16 sequence; + BYTE nbBits; + BYTE length; +} HUF_DEltX4; /* double-symbols decoding */ + +typedef struct { + BYTE symbol; + BYTE weight; +} sortedSymbol_t; + +/* HUF_fillDTableX4Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX4Level2(HUF_DEltX4 *DTable, U32 sizeLog, const U32 consumed, const U32 *rankValOrigin, const int minWeight, + const sortedSymbol_t *sortedSymbols, const U32 sortedListSize, U32 nbBitsBaseline, U16 baseSeq) +{ + HUF_DEltX4 DElt; + U32 rankVal[HUF_TABLELOG_MAX + 1]; + + /* get pre-calculated rankVal */ + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill skipped values */ + if (minWeight > 1) { + U32 i, skipSize = rankVal[minWeight]; + ZSTD_writeLE16(&(DElt.sequence), baseSeq); + DElt.nbBits = (BYTE)(consumed); + DElt.length = 1; + for (i = 0; i < skipSize; i++) + DTable[i] = DElt; + } + + /* fill DTable */ 
+ { + U32 s; + for (s = 0; s < sortedListSize; s++) { /* note : sortedSymbols already skipped */ + const U32 symbol = sortedSymbols[s].symbol; + const U32 weight = sortedSymbols[s].weight; + const U32 nbBits = nbBitsBaseline - weight; + const U32 length = 1 << (sizeLog - nbBits); + const U32 start = rankVal[weight]; + U32 i = start; + const U32 end = start + length; + + ZSTD_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8))); + DElt.nbBits = (BYTE)(nbBits + consumed); + DElt.length = 2; + do { + DTable[i++] = DElt; + } while (i < end); /* since length >= 1 */ + + rankVal[weight] += length; + } + } +} + +typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1]; +typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; + +static void HUF_fillDTableX4(HUF_DEltX4 *DTable, const U32 targetLog, const sortedSymbol_t *sortedList, const U32 sortedListSize, const U32 *rankStart, + rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline) +{ + U32 rankVal[HUF_TABLELOG_MAX + 1]; + const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ + const U32 minBits = nbBitsBaseline - maxWeight; + U32 s; + + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill DTable */ + for (s = 0; s < sortedListSize; s++) { + const U16 symbol = sortedList[s].symbol; + const U32 weight = sortedList[s].weight; + const U32 nbBits = nbBitsBaseline - weight; + const U32 start = rankVal[weight]; + const U32 length = 1 << (targetLog - nbBits); + + if (targetLog - nbBits >= minBits) { /* enough room for a second symbol */ + U32 sortedRank; + int minWeight = nbBits + scaleLog; + if (minWeight < 1) + minWeight = 1; + sortedRank = rankStart[minWeight]; + HUF_fillDTableX4Level2(DTable + start, targetLog - nbBits, nbBits, rankValOrigin[nbBits], minWeight, sortedList + sortedRank, + sortedListSize - sortedRank, nbBitsBaseline, symbol); + } else { + HUF_DEltX4 DElt; + ZSTD_writeLE16(&(DElt.sequence), symbol); + DElt.nbBits = (BYTE)(nbBits); + 
DElt.length = 1; + { + U32 const end = start + length; + U32 u; + for (u = start; u < end; u++) + DTable[u] = DElt; + } + } + rankVal[weight] += length; + } +} + +size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize) +{ + U32 tableLog, maxW, sizeOfSort, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog; + size_t iSize; + void *dtPtr = DTable + 1; /* force compiler to avoid strict-aliasing */ + HUF_DEltX4 *const dt = (HUF_DEltX4 *)dtPtr; + U32 *rankStart; + + rankValCol_t *rankVal; + U32 *rankStats; + U32 *rankStart0; + sortedSymbol_t *sortedSymbol; + BYTE *weightList; + size_t spaceUsed32 = 0; + + HUF_STATIC_ASSERT((sizeof(rankValCol_t) & 3) == 0); + + rankVal = (rankValCol_t *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2; + rankStats = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 1; + rankStart0 = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 2; + sortedSymbol = (sortedSymbol_t *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; + weightList = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + rankStart = rankStart0 + 1; + memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + + HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ + if (maxTableLog > HUF_TABLELOG_MAX) + return ERROR(tableLog_tooLarge); + /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + + iSize = HUF_readStats_wksp(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize); + if (HUF_isError(iSize)) + return iSize; + + /* check result */ + if (tableLog > maxTableLog) + return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + + /* find maxWeight */ + for (maxW = tableLog; rankStats[maxW] == 0; maxW--) { + } /* necessarily finds a solution before 0 */ + + /* Get start index of each weight */ + { + U32 w, nextRankStart = 0; + for (w = 1; w < maxW + 1; w++) { + U32 curr = nextRankStart; + nextRankStart += rankStats[w]; + rankStart[w] = curr; + } + rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ + sizeOfSort = nextRankStart; + } + + /* sort symbols by weight */ + { + U32 s; + for (s = 0; s < nbSymbols; s++) { + U32 const w = weightList[s]; + U32 const r = rankStart[w]++; + sortedSymbol[r].symbol = (BYTE)s; + sortedSymbol[r].weight = (BYTE)w; + } + rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ + } + + /* Build rankVal */ + { + U32 *const rankVal0 = rankVal[0]; + { + int const rescale = (maxTableLog - tableLog) - 1; /* tableLog <= maxTableLog */ + U32 nextRankVal = 0; + U32 w; + for (w = 1; w < maxW + 1; w++) { + U32 curr = nextRankVal; + nextRankVal += rankStats[w] << (w + rescale); + rankVal0[w] = curr; + } + } + { + U32 const minBits = tableLog + 1 - maxW; + U32 consumed; + for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { + U32 *const rankValPtr = rankVal[consumed]; + U32 w; + for (w = 1; w < maxW + 1; w++) { + rankValPtr[w] = rankVal0[w] >> consumed; + } + } + } + } + + HUF_fillDTableX4(dt, maxTableLog, sortedSymbol, sizeOfSort, rankStart0, rankVal, maxW, tableLog + 1); + + dtd.tableLog = (BYTE)maxTableLog; + dtd.tableType = 1; + memcpy(DTable, &dtd, sizeof(dtd)); + return iSize; +} + +static U32 HUF_decodeSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog) +{ + 
size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt + val, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +static U32 HUF_decodeLastSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt + val, 1); + if (dt[val].length == 1) + BIT_skipBits(DStream, dt[val].nbBits); + else { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer) * 8)) { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer) * 8)) + /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer) * 8); + } + } + return 1; +} + +#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \ + if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \ + if (ZSTD_64bits()) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +FORCE_INLINE size_t HUF_decodeStreamX4(BYTE *p, BIT_DStream_t *bitDPtr, BYTE *const pEnd, const HUF_DEltX4 *const dt, const U32 dtLog) +{ + BYTE *const pStart = p; + + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd - (sizeof(bitDPtr->bitContainer) - 1))) { + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_1(p, bitDPtr); + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd - 2)) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + + while (p <= pEnd - 2) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of 
DStream */ + + if (p < pEnd) + p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog); + + return p - pStart; +} + +static size_t HUF_decompress1X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + { + size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize); + if (HUF_isError(errorCode)) + return errorCode; + } + + /* decode */ + { + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + const void *const dtPtr = DTable + 1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) + return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} + +size_t HUF_decompress1X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) + return ERROR(GENERIC); + return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE *)cSrc; + + size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx); +} + +static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + if (cSrcSize < 10) + return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ 
+ + { + const BYTE *const istart = (const BYTE *)cSrc; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + const void *const dtPtr = DTable + 1; + const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = ZSTD_readLE16(istart); + size_t const length2 = ZSTD_readLE16(istart + 2); + size_t const length3 = ZSTD_readLE16(istart + 4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE *const istart1 = istart + 6; /* jumpTable */ + const BYTE *const istart2 = istart1 + length1; + const BYTE *const istart3 = istart2 + length2; + const BYTE *const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize + 3) / 4; + BYTE *const opStart2 = ostart + segmentSize; + BYTE *const opStart3 = opStart2 + segmentSize; + BYTE *const opStart4 = opStart3 + segmentSize; + BYTE *op1 = ostart; + BYTE *op2 = opStart2; + BYTE *op3 = opStart3; + BYTE *op4 = opStart4; + U32 endSignal; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) + return ERROR(corruption_detected); /* overflow */ + { + size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4); + if (HUF_isError(errorCode)) + return errorCode; + } + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for (; (endSignal == BIT_DStream_unfinished) & (op4 < (oend - 
(sizeof(bitD4.bitContainer) - 1)));) { + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_1(op1, &bitD1); + HUF_DECODE_SYMBOLX4_1(op2, &bitD2); + HUF_DECODE_SYMBOLX4_1(op3, &bitD3); + HUF_DECODE_SYMBOLX4_1(op4, &bitD4); + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_0(op1, &bitD1); + HUF_DECODE_SYMBOLX4_0(op2, &bitD2); + HUF_DECODE_SYMBOLX4_0(op3, &bitD3); + HUF_DECODE_SYMBOLX4_0(op4, &bitD4); + + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) + return ERROR(corruption_detected); + if (op2 > opStart3) + return ERROR(corruption_detected); + if (op3 > opStart4) + return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { + U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) + return ERROR(corruption_detected); + } + + /* decoded size */ + return dstSize; + } +} + +size_t HUF_decompress4X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) + return ERROR(GENERIC); + return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void 
*workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE *)cSrc; + + size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx); +} + +/* ********************************/ +/* Generic decompression selector */ +/* ********************************/ + +size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); + return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) + : HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); + return dtd.tableType ? 
HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) + : HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable); +} + +typedef struct { + U32 tableTime; + U32 decode256Time; +} algo_time_t; +static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = { + /* single, double, quad */ + {{0, 0}, {1, 1}, {2, 2}}, /* Q==0 : impossible */ + {{0, 0}, {1, 1}, {2, 2}}, /* Q==1 : impossible */ + {{38, 130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ + {{448, 128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ + {{556, 128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ + {{714, 128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ + {{883, 128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ + {{897, 128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ + {{926, 128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ + {{947, 128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ + {{1107, 128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ + {{1177, 128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ + {{1242, 128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ + {{1349, 128}, {2644, 106}, {5260, 106}}, /* Q ==13 : 81-87% */ + {{1455, 128}, {2422, 124}, {4174, 124}}, /* Q ==14 : 87-93% */ + {{722, 128}, {1891, 145}, {1936, 146}}, /* Q ==15 : 93-99% */ +}; + +/** HUF_selectDecoder() : +* Tells which decoder is likely to decode faster, +* based on a set of pre-determined metrics. +* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . 
+* Assumption : 0 < cSrcSize < dstSize <= 128 KB */ +U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize) +{ + /* decoder timing evaluation */ + U32 const Q = (U32)(cSrcSize * 16 / dstSize); /* Q < 16 since dstSize > cSrcSize */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */ + + return DTime1 < DTime0; +} + +typedef size_t (*decompressionAlgo)(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); + +size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + /* validation checks */ + if (dstSize == 0) + return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) + return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { + memcpy(dst, cSrc, dstSize); + return dstSize; + } /* not compressed */ + if (cSrcSize == 1) { + memset(dst, *(const BYTE *)cSrc, dstSize); + return dstSize; + } /* RLE */ + + { + U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); + return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize) + : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize); + } +} + +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + /* validation checks */ + if (dstSize == 0) + return ERROR(dstSize_tooSmall); + if ((cSrcSize >= dstSize) || (cSrcSize <= 1)) + return ERROR(corruption_detected); /* invalid */ + + { + U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); + return algoNb ? 
HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize) + : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize); + } +} + +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + /* validation checks */ + if (dstSize == 0) + return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) + return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { + memcpy(dst, cSrc, dstSize); + return dstSize; + } /* not compressed */ + if (cSrcSize == 1) { + memset(dst, *(const BYTE *)cSrc, dstSize); + return dstSize; + } /* RLE */ + + { + U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); + return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize) + : HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize); + } +} diff --git a/lib/zstd/mem.h b/lib/zstd/mem.h new file mode 100644 index 000000000000..3a0f34c8706c --- /dev/null +++ b/lib/zstd/mem.h @@ -0,0 +1,151 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ */ + +#ifndef MEM_H_MODULE +#define MEM_H_MODULE + +/*-**************************************** +* Dependencies +******************************************/ +#include <asm/unaligned.h> +#include <linux/string.h> /* memcpy */ +#include <linux/types.h> /* size_t, ptrdiff_t */ + +/*-**************************************** +* Compiler specifics +******************************************/ +#define ZSTD_STATIC static __inline __attribute__((unused)) + +/*-************************************************************** +* Basic Types +*****************************************************************/ +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef int16_t S16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef int64_t S64; +typedef ptrdiff_t iPtrDiff; +typedef uintptr_t uPtrDiff; + +/*-************************************************************** +* Memory I/O +*****************************************************************/ +ZSTD_STATIC unsigned ZSTD_32bits(void) { return sizeof(size_t) == 4; } +ZSTD_STATIC unsigned ZSTD_64bits(void) { return sizeof(size_t) == 8; } + +#if defined(__LITTLE_ENDIAN) +#define ZSTD_LITTLE_ENDIAN 1 +#else +#define ZSTD_LITTLE_ENDIAN 0 +#endif + +ZSTD_STATIC unsigned ZSTD_isLittleEndian(void) { return ZSTD_LITTLE_ENDIAN; } + +ZSTD_STATIC U16 ZSTD_read16(const void *memPtr) { return get_unaligned((const U16 *)memPtr); } + +ZSTD_STATIC U32 ZSTD_read32(const void *memPtr) { return get_unaligned((const U32 *)memPtr); } + +ZSTD_STATIC U64 ZSTD_read64(const void *memPtr) { return get_unaligned((const U64 *)memPtr); } + +ZSTD_STATIC size_t ZSTD_readST(const void *memPtr) { return get_unaligned((const size_t *)memPtr); } + +ZSTD_STATIC void ZSTD_write16(void *memPtr, U16 value) { put_unaligned(value, (U16 *)memPtr); } + +ZSTD_STATIC void ZSTD_write32(void *memPtr, U32 value) { put_unaligned(value, (U32 *)memPtr); } + +ZSTD_STATIC void ZSTD_write64(void *memPtr, U64 value) { put_unaligned(value, (U64 *)memPtr); } + +/*=== Little endian r/w ===*/ + +ZSTD_STATIC U16
ZSTD_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); } + +ZSTD_STATIC void ZSTD_writeLE16(void *memPtr, U16 val) { put_unaligned_le16(val, memPtr); } + +ZSTD_STATIC U32 ZSTD_readLE24(const void *memPtr) { return ZSTD_readLE16(memPtr) + (((const BYTE *)memPtr)[2] << 16); } + +ZSTD_STATIC void ZSTD_writeLE24(void *memPtr, U32 val) +{ + ZSTD_writeLE16(memPtr, (U16)val); + ((BYTE *)memPtr)[2] = (BYTE)(val >> 16); +} + +ZSTD_STATIC U32 ZSTD_readLE32(const void *memPtr) { return get_unaligned_le32(memPtr); } + +ZSTD_STATIC void ZSTD_writeLE32(void *memPtr, U32 val32) { put_unaligned_le32(val32, memPtr); } + +ZSTD_STATIC U64 ZSTD_readLE64(const void *memPtr) { return get_unaligned_le64(memPtr); } + +ZSTD_STATIC void ZSTD_writeLE64(void *memPtr, U64 val64) { put_unaligned_le64(val64, memPtr); } + +ZSTD_STATIC size_t ZSTD_readLEST(const void *memPtr) +{ + if (ZSTD_32bits()) + return (size_t)ZSTD_readLE32(memPtr); + else + return (size_t)ZSTD_readLE64(memPtr); +} + +ZSTD_STATIC void ZSTD_writeLEST(void *memPtr, size_t val) +{ + if (ZSTD_32bits()) + ZSTD_writeLE32(memPtr, (U32)val); + else + ZSTD_writeLE64(memPtr, (U64)val); +} + +/*=== Big endian r/w ===*/ + +ZSTD_STATIC U32 ZSTD_readBE32(const void *memPtr) { return get_unaligned_be32(memPtr); } + +ZSTD_STATIC void ZSTD_writeBE32(void *memPtr, U32 val32) { put_unaligned_be32(val32, memPtr); } + +ZSTD_STATIC U64 ZSTD_readBE64(const void *memPtr) { return get_unaligned_be64(memPtr); } + +ZSTD_STATIC void ZSTD_writeBE64(void *memPtr, U64 val64) { put_unaligned_be64(val64, memPtr); } + +ZSTD_STATIC size_t ZSTD_readBEST(const void *memPtr) +{ + if (ZSTD_32bits()) + return (size_t)ZSTD_readBE32(memPtr); + else + return (size_t)ZSTD_readBE64(memPtr); +} + +ZSTD_STATIC void ZSTD_writeBEST(void *memPtr, size_t val) +{ + if (ZSTD_32bits()) + ZSTD_writeBE32(memPtr, (U32)val); + else + ZSTD_writeBE64(memPtr, (U64)val); +} + +/* function safe only for comparisons */ +ZSTD_STATIC U32 ZSTD_readMINMATCH(const void 
*memPtr, U32 length) +{ + switch (length) { + default: + case 4: return ZSTD_read32(memPtr); + case 3: + if (ZSTD_isLittleEndian()) + return ZSTD_read32(memPtr) << 8; + else + return ZSTD_read32(memPtr) >> 8; + } +} + +#endif /* MEM_H_MODULE */ diff --git a/lib/zstd/zstd_common.c b/lib/zstd/zstd_common.c new file mode 100644 index 000000000000..a282624ee155 --- /dev/null +++ b/lib/zstd/zstd_common.c @@ -0,0 +1,75 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +/*-************************************* +* Dependencies +***************************************/ +#include "error_private.h" +#include "zstd_internal.h" /* declaration of ZSTD_isError, ZSTD_getErrorName, ZSTD_getErrorCode, ZSTD_getErrorString, ZSTD_versionNumber */ +#include <linux/kernel.h> + +/*=************************************************************** +* Custom allocator +****************************************************************/ + +#define stack_push(stack, size) \ + ({ \ + void *const ptr = ZSTD_PTR_ALIGN((stack)->ptr); \ + (stack)->ptr = (char *)ptr + (size); \ + (stack)->ptr <= (stack)->end ?
ptr : NULL; \ + }) + +ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize) +{ + ZSTD_customMem stackMem = {ZSTD_stackAlloc, ZSTD_stackFree, workspace}; + ZSTD_stack *stack = (ZSTD_stack *)workspace; + /* Verify preconditions */ + if (!workspace || workspaceSize < sizeof(ZSTD_stack) || workspace != ZSTD_PTR_ALIGN(workspace)) { + ZSTD_customMem error = {NULL, NULL, NULL}; + return error; + } + /* Initialize the stack */ + stack->ptr = workspace; + stack->end = (char *)workspace + workspaceSize; + stack_push(stack, sizeof(ZSTD_stack)); + return stackMem; +} + +void *ZSTD_stackAllocAll(void *opaque, size_t *size) +{ + ZSTD_stack *stack = (ZSTD_stack *)opaque; + *size = (BYTE const *)stack->end - (BYTE *)ZSTD_PTR_ALIGN(stack->ptr); + return stack_push(stack, *size); +} + +void *ZSTD_stackAlloc(void *opaque, size_t size) +{ + ZSTD_stack *stack = (ZSTD_stack *)opaque; + return stack_push(stack, size); +} +void ZSTD_stackFree(void *opaque, void *address) +{ + (void)opaque; + (void)address; +} + +void *ZSTD_malloc(size_t size, ZSTD_customMem customMem) { return customMem.customAlloc(customMem.opaque, size); } + +void ZSTD_free(void *ptr, ZSTD_customMem customMem) +{ + if (ptr != NULL) + customMem.customFree(customMem.opaque, ptr); +} diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h new file mode 100644 index 000000000000..1a79fab9e13a --- /dev/null +++ b/lib/zstd/zstd_internal.h @@ -0,0 +1,263 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. 
This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +#ifndef ZSTD_CCOMMON_H_MODULE +#define ZSTD_CCOMMON_H_MODULE + +/*-******************************************************* +* Compiler specifics +*********************************************************/ +#define FORCE_INLINE static __always_inline +#define FORCE_NOINLINE static noinline + +/*-************************************* +* Dependencies +***************************************/ +#include "error_private.h" +#include "mem.h" +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/xxhash.h> +#include <linux/zstd.h> + +/*-************************************* +* shared macros +***************************************/ +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define CHECK_F(f) \ + { \ + size_t const errcod = f; \ + if (ERR_isError(errcod)) \ + return errcod; \ + } /* check and Forward error code */ +#define CHECK_E(f, e) \ + { \ + size_t const errcod = f; \ + if (ERR_isError(errcod)) \ + return ERROR(e); \ + } /* check and send Error code */ +#define ZSTD_STATIC_ASSERT(c) \ + { \ + enum { ZSTD_static_assert = 1 / (int)(!!(c)) }; \ + } + +/*-************************************* +* Common constants +***************************************/ +#define ZSTD_OPT_NUM (1 << 12) +#define ZSTD_DICT_MAGIC 0xEC30A437 /* v0.7+ */ + +#define ZSTD_REP_NUM 3 /* number of repcodes */ +#define ZSTD_REP_CHECK (ZSTD_REP_NUM) /* number of repcodes to check by the optimal parser */ +#define ZSTD_REP_MOVE (ZSTD_REP_NUM - 1) +#define ZSTD_REP_MOVE_OPT (ZSTD_REP_NUM) +static const U32 repStartValue[ZSTD_REP_NUM] = {1, 4, 8}; + +#define KB *(1 << 10) +#define MB *(1 << 20) +#define GB *(1U << 30) + +#define BIT7 128 +#define BIT6 64 +#define BIT5 32 +#define BIT4 16 +#define BIT1 2 +#define BIT0 1 + +#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10 +static const size_t ZSTD_fcs_fieldSize[4] = {0, 2, 4, 8}; +static const size_t ZSTD_did_fieldSize[4] = {0, 1,
2, 4}; + +#define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */ +static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; +typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + +#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ + +#define HufLog 12 +typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + +#define LONGNBSEQ 0x7F00 + +#define MINMATCH 3 +#define EQUAL_READ32 4 + +#define Litbits 8 +#define MaxLit ((1 << Litbits) - 1) +#define MaxML 52 +#define MaxLL 35 +#define MaxOff 28 +#define MaxSeq MAX(MaxLL, MaxML) /* Assumption : MaxOff < MaxLL,MaxML */ +#define MLFSELog 9 +#define LLFSELog 9 +#define OffFSELog 8 + +static const U32 LL_bits[MaxLL + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; +static const S16 LL_defaultNorm[MaxLL + 1] = {4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1}; +#define LL_DEFAULTNORMLOG 6 /* for static allocation */ +static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG; + +static const U32 ML_bits[MaxML + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; +static const S16 ML_defaultNorm[MaxML + 1] = {1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1}; +#define ML_DEFAULTNORMLOG 6 /* for static allocation */ +static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG; + +static const S16 OF_defaultNorm[MaxOff + 1] = {1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 
-1}; +#define OF_DEFAULTNORMLOG 5 /* for static allocation */ +static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG; + +/*-******************************************* +* Shared functions to include for inlining +*********************************************/ +ZSTD_STATIC void ZSTD_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} +/*! ZSTD_wildcopy() : +* custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */ +#define WILDCOPY_OVERLENGTH 8 +ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length) +{ + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + length; + /* Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388. + * Avoid the bad case where the loop only runs once by handling the + * special case separately. This doesn't trigger the bug because it + * doesn't involve pointer/integer overflow. + */ + if (length <= 8) + return ZSTD_copy8(dst, src); + do { + ZSTD_copy8(op, ip); + op += 8; + ip += 8; + } while (op < oend); +} + +/*-******************************************* +* Private interfaces +*********************************************/ +typedef struct ZSTD_stats_s ZSTD_stats_t; + +typedef struct { + U32 off; + U32 len; +} ZSTD_match_t; + +typedef struct { + U32 price; + U32 off; + U32 mlen; + U32 litlen; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_optimal_t; + +typedef struct seqDef_s { + U32 offset; + U16 litLength; + U16 matchLength; +} seqDef; + +typedef struct { + seqDef *sequencesStart; + seqDef *sequences; + BYTE *litStart; + BYTE *lit; + BYTE *llCode; + BYTE *mlCode; + BYTE *ofCode; + U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */ + U32 longLengthPos; + /* opt */ + ZSTD_optimal_t *priceTable; + ZSTD_match_t *matchTable; + U32 *matchLengthFreq; + U32 *litLengthFreq; + U32 *litFreq; + U32 *offCodeFreq; + U32 matchLengthSum; + U32 matchSum; + U32 litLengthSum; + U32 litSum; + U32 offCodeSum; + U32 
log2matchLengthSum; + U32 log2matchSum; + U32 log2litLengthSum; + U32 log2litSum; + U32 log2offCodeSum; + U32 factor; + U32 staticPrices; + U32 cachedPrice; + U32 cachedLitLength; + const BYTE *cachedLiterals; +} seqStore_t; + +const seqStore_t *ZSTD_getSeqStore(const ZSTD_CCtx *ctx); +void ZSTD_seqToCodes(const seqStore_t *seqStorePtr); +int ZSTD_isSkipFrame(ZSTD_DCtx *dctx); + +/*= Custom memory allocation functions */ +typedef void *(*ZSTD_allocFunction)(void *opaque, size_t size); +typedef void (*ZSTD_freeFunction)(void *opaque, void *address); +typedef struct { + ZSTD_allocFunction customAlloc; + ZSTD_freeFunction customFree; + void *opaque; +} ZSTD_customMem; + +void *ZSTD_malloc(size_t size, ZSTD_customMem customMem); +void ZSTD_free(void *ptr, ZSTD_customMem customMem); + +/*====== stack allocation ======*/ + +typedef struct { + void *ptr; + const void *end; +} ZSTD_stack; + +#define ZSTD_ALIGN(x) ALIGN(x, sizeof(size_t)) +#define ZSTD_PTR_ALIGN(p) PTR_ALIGN(p, sizeof(size_t)) + +ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize); + +void *ZSTD_stackAllocAll(void *opaque, size_t *size); +void *ZSTD_stackAlloc(void *opaque, size_t size); +void ZSTD_stackFree(void *opaque, void *address); + +/*====== common function ======*/ + +ZSTD_STATIC U32 ZSTD_highbit32(U32 val) { return 31 - __builtin_clz(val); } + +/* hidden functions */ + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! 
*/ +void ZSTD_invalidateRepCodes(ZSTD_CCtx *cctx); + +size_t ZSTD_freeCCtx(ZSTD_CCtx *cctx); +size_t ZSTD_freeDCtx(ZSTD_DCtx *dctx); +size_t ZSTD_freeCDict(ZSTD_CDict *cdict); +size_t ZSTD_freeDDict(ZSTD_DDict *cdict); +size_t ZSTD_freeCStream(ZSTD_CStream *zcs); +size_t ZSTD_freeDStream(ZSTD_DStream *zds); + +#endif /* ZSTD_CCOMMON_H_MODULE */ diff --git a/lib/zstd/zstd_opt.h b/lib/zstd/zstd_opt.h new file mode 100644 index 000000000000..55e1b4cba808 --- /dev/null +++ b/lib/zstd/zstd_opt.h @@ -0,0 +1,1014 @@ +/** + * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ */ + +/* Note : this file is intended to be included within zstd_compress.c */ + +#ifndef ZSTD_OPT_H_91842398743 +#define ZSTD_OPT_H_91842398743 + +#define ZSTD_LITFREQ_ADD 2 +#define ZSTD_FREQ_DIV 4 +#define ZSTD_MAX_PRICE (1 << 30) + +/*-************************************* +* Price functions for optimal parser +***************************************/ +FORCE_INLINE void ZSTD_setLog2Prices(seqStore_t *ssPtr) +{ + ssPtr->log2matchLengthSum = ZSTD_highbit32(ssPtr->matchLengthSum + 1); + ssPtr->log2litLengthSum = ZSTD_highbit32(ssPtr->litLengthSum + 1); + ssPtr->log2litSum = ZSTD_highbit32(ssPtr->litSum + 1); + ssPtr->log2offCodeSum = ZSTD_highbit32(ssPtr->offCodeSum + 1); + ssPtr->factor = 1 + ((ssPtr->litSum >> 5) / ssPtr->litLengthSum) + ((ssPtr->litSum << 1) / (ssPtr->litSum + ssPtr->matchSum)); +} + +ZSTD_STATIC void ZSTD_rescaleFreqs(seqStore_t *ssPtr, const BYTE *src, size_t srcSize) +{ + unsigned u; + + ssPtr->cachedLiterals = NULL; + ssPtr->cachedPrice = ssPtr->cachedLitLength = 0; + ssPtr->staticPrices = 0; + + if (ssPtr->litLengthSum == 0) { + if (srcSize <= 1024) + ssPtr->staticPrices = 1; + + for (u = 0; u <= MaxLit; u++) + ssPtr->litFreq[u] = 0; + for (u = 0; u < srcSize; u++) + ssPtr->litFreq[src[u]]++; + + ssPtr->litSum = 0; + ssPtr->litLengthSum = MaxLL + 1; + ssPtr->matchLengthSum = MaxML + 1; + ssPtr->offCodeSum = (MaxOff + 1); + ssPtr->matchSum = (ZSTD_LITFREQ_ADD << Litbits); + + for (u = 0; u <= MaxLit; u++) { + ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u] >> ZSTD_FREQ_DIV); + ssPtr->litSum += ssPtr->litFreq[u]; + } + for (u = 0; u <= MaxLL; u++) + ssPtr->litLengthFreq[u] = 1; + for (u = 0; u <= MaxML; u++) + ssPtr->matchLengthFreq[u] = 1; + for (u = 0; u <= MaxOff; u++) + ssPtr->offCodeFreq[u] = 1; + } else { + ssPtr->matchLengthSum = 0; + ssPtr->litLengthSum = 0; + ssPtr->offCodeSum = 0; + ssPtr->matchSum = 0; + ssPtr->litSum = 0; + + for (u = 0; u <= MaxLit; u++) { + ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u] >> (ZSTD_FREQ_DIV + 1)); + 
ssPtr->litSum += ssPtr->litFreq[u]; + } + for (u = 0; u <= MaxLL; u++) { + ssPtr->litLengthFreq[u] = 1 + (ssPtr->litLengthFreq[u] >> (ZSTD_FREQ_DIV + 1)); + ssPtr->litLengthSum += ssPtr->litLengthFreq[u]; + } + for (u = 0; u <= MaxML; u++) { + ssPtr->matchLengthFreq[u] = 1 + (ssPtr->matchLengthFreq[u] >> ZSTD_FREQ_DIV); + ssPtr->matchLengthSum += ssPtr->matchLengthFreq[u]; + ssPtr->matchSum += ssPtr->matchLengthFreq[u] * (u + 3); + } + ssPtr->matchSum *= ZSTD_LITFREQ_ADD; + for (u = 0; u <= MaxOff; u++) { + ssPtr->offCodeFreq[u] = 1 + (ssPtr->offCodeFreq[u] >> ZSTD_FREQ_DIV); + ssPtr->offCodeSum += ssPtr->offCodeFreq[u]; + } + } + + ZSTD_setLog2Prices(ssPtr); +} + +FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t *ssPtr, U32 litLength, const BYTE *literals) +{ + U32 price, u; + + if (ssPtr->staticPrices) + return ZSTD_highbit32((U32)litLength + 1) + (litLength * 6); + + if (litLength == 0) + return ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[0] + 1); + + /* literals */ + if (ssPtr->cachedLiterals == literals) { + U32 const additional = litLength - ssPtr->cachedLitLength; + const BYTE *literals2 = ssPtr->cachedLiterals + ssPtr->cachedLitLength; + price = ssPtr->cachedPrice + additional * ssPtr->log2litSum; + for (u = 0; u < additional; u++) + price -= ZSTD_highbit32(ssPtr->litFreq[literals2[u]] + 1); + ssPtr->cachedPrice = price; + ssPtr->cachedLitLength = litLength; + } else { + price = litLength * ssPtr->log2litSum; + for (u = 0; u < litLength; u++) + price -= ZSTD_highbit32(ssPtr->litFreq[literals[u]] + 1); + + if (litLength >= 12) { + ssPtr->cachedLiterals = literals; + ssPtr->cachedPrice = price; + ssPtr->cachedLitLength = litLength; + } + } + + /* literal Length */ + { + const BYTE LL_deltaCode = 19; + const BYTE llCode = (litLength > 63) ? 
(BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; + price += LL_bits[llCode] + ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[llCode] + 1); + } + + return price; +} + +FORCE_INLINE U32 ZSTD_getPrice(seqStore_t *seqStorePtr, U32 litLength, const BYTE *literals, U32 offset, U32 matchLength, const int ultra) +{ + /* offset */ + U32 price; + BYTE const offCode = (BYTE)ZSTD_highbit32(offset + 1); + + if (seqStorePtr->staticPrices) + return ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + ZSTD_highbit32((U32)matchLength + 1) + 16 + offCode; + + price = offCode + seqStorePtr->log2offCodeSum - ZSTD_highbit32(seqStorePtr->offCodeFreq[offCode] + 1); + if (!ultra && offCode >= 20) + price += (offCode - 19) * 2; + + /* match Length */ + { + const BYTE ML_deltaCode = 36; + const BYTE mlCode = (matchLength > 127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength]; + price += ML_bits[mlCode] + seqStorePtr->log2matchLengthSum - ZSTD_highbit32(seqStorePtr->matchLengthFreq[mlCode] + 1); + } + + return price + ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + seqStorePtr->factor; +} + +ZSTD_STATIC void ZSTD_updatePrice(seqStore_t *seqStorePtr, U32 litLength, const BYTE *literals, U32 offset, U32 matchLength) +{ + U32 u; + + /* literals */ + seqStorePtr->litSum += litLength * ZSTD_LITFREQ_ADD; + for (u = 0; u < litLength; u++) + seqStorePtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; + + /* literal Length */ + { + const BYTE LL_deltaCode = 19; + const BYTE llCode = (litLength > 63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; + seqStorePtr->litLengthFreq[llCode]++; + seqStorePtr->litLengthSum++; + } + + /* match offset */ + { + BYTE const offCode = (BYTE)ZSTD_highbit32(offset + 1); + seqStorePtr->offCodeSum++; + seqStorePtr->offCodeFreq[offCode]++; + } + + /* match Length */ + { + const BYTE ML_deltaCode = 36; + const BYTE mlCode = (matchLength > 127) ? 
(BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength]; + seqStorePtr->matchLengthFreq[mlCode]++; + seqStorePtr->matchLengthSum++; + } + + ZSTD_setLog2Prices(seqStorePtr); +} + +#define SET_PRICE(pos, mlen_, offset_, litlen_, price_) \ + { \ + while (last_pos < pos) { \ + opt[last_pos + 1].price = ZSTD_MAX_PRICE; \ + last_pos++; \ + } \ + opt[pos].mlen = mlen_; \ + opt[pos].off = offset_; \ + opt[pos].litlen = litlen_; \ + opt[pos].price = price_; \ + } + +/* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +FORCE_INLINE +U32 ZSTD_insertAndFindFirstIndexHash3(ZSTD_CCtx *zc, const BYTE *ip) +{ + U32 *const hashTable3 = zc->hashTable3; + U32 const hashLog3 = zc->hashLog3; + const BYTE *const base = zc->base; + U32 idx = zc->nextToUpdate3; + const U32 target = zc->nextToUpdate3 = (U32)(ip - base); + const size_t hash3 = ZSTD_hash3Ptr(ip, hashLog3); + + while (idx < target) { + hashTable3[ZSTD_hash3Ptr(base + idx, hashLog3)] = idx; + idx++; + } + + return hashTable3[hash3]; +} + +/*-************************************* +* Binary Tree search +***************************************/ +static U32 ZSTD_insertBtAndGetAllMatches(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, U32 nbCompares, const U32 mls, U32 extDict, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + const BYTE *const base = zc->base; + const U32 curr = (U32)(ip - base); + const U32 hashLog = zc->params.cParams.hashLog; + const size_t h = ZSTD_hashPtr(ip, hashLog, mls); + U32 *const hashTable = zc->hashTable; + U32 matchIndex = hashTable[h]; + U32 *const bt = zc->chainTable; + const U32 btLog = zc->params.cParams.chainLog - 1; + const U32 btMask = (1U << btLog) - 1; + size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE *const dictBase = zc->dictBase; + const U32 dictLimit = zc->dictLimit; + const BYTE *const dictEnd = dictBase + dictLimit; + const BYTE *const prefixStart = base + dictLimit; + const U32 
btLow = btMask >= curr ? 0 : curr - btMask; + const U32 windowLow = zc->lowLimit; + U32 *smallerPtr = bt + 2 * (curr & btMask); + U32 *largerPtr = bt + 2 * (curr & btMask) + 1; + U32 matchEndIdx = curr + 8; + U32 dummy32; /* to be nullified at the end */ + U32 mnum = 0; + + const U32 minMatch = (mls == 3) ? 3 : 4; + size_t bestLength = minMatchLen - 1; + + if (minMatch == 3) { /* HC3 match finder */ + U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(zc, ip); + if (matchIndex3 > windowLow && (curr - matchIndex3 < (1 << 18))) { + const BYTE *match; + size_t currMl = 0; + if ((!extDict) || matchIndex3 >= dictLimit) { + match = base + matchIndex3; + if (match[bestLength] == ip[bestLength]) + currMl = ZSTD_count(ip, match, iLimit); + } else { + match = dictBase + matchIndex3; + if (ZSTD_readMINMATCH(match, MINMATCH) == + ZSTD_readMINMATCH(ip, MINMATCH)) /* assumption : matchIndex3 <= dictLimit-4 (by table construction) */ + currMl = ZSTD_count_2segments(ip + MINMATCH, match + MINMATCH, iLimit, dictEnd, prefixStart) + MINMATCH; + } + + /* save best solution */ + if (currMl > bestLength) { + bestLength = currMl; + matches[mnum].off = ZSTD_REP_MOVE_OPT + curr - matchIndex3; + matches[mnum].len = (U32)currMl; + mnum++; + if (currMl > ZSTD_OPT_NUM) + goto update; + if (ip + currMl == iLimit) + goto update; /* best possible, and avoid read overflow*/ + } + } + } + + hashTable[h] = curr; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32 *nextPtr = bt + 2 * (matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE *match; + + if ((!extDict) || (matchIndex + matchLength >= dictLimit)) { + match = base + matchIndex; + if (match[matchLength] == ip[matchLength]) { + matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iLimit) + 1; + } + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip + 
matchLength, match + matchLength, iLimit, dictEnd, prefixStart); + if (matchIndex + matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = ZSTD_REP_MOVE_OPT + curr - matchIndex; + matches[mnum].len = (U32)matchLength; + mnum++; + if (matchLength > ZSTD_OPT_NUM) + break; + if (ip + matchLength == iLimit) /* equal : no way to know if inf or sup */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + + if (match[matchLength] < ip[matchLength]) { + /* match is smaller than curr */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { + smallerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */ + } else { + /* match is larger than curr */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { + largerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } + } + + *smallerPtr = *largerPtr = 0; + +update: + zc->nextToUpdate = (matchEndIdx > curr + 8) ? 
matchEndIdx - 8 : curr + 1; + return mnum; +} + +/** Tree updater, providing best match */ +static U32 ZSTD_BtGetAllMatches(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, const U32 maxNbAttempts, const U32 mls, ZSTD_match_t *matches, + const U32 minMatchLen) +{ + if (ip < zc->base + zc->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls); + return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 0, matches, minMatchLen); +} + +static U32 ZSTD_BtGetAllMatches_selectMLS(ZSTD_CCtx *zc, /* Index table will be updated */ + const BYTE *ip, const BYTE *const iHighLimit, const U32 maxNbAttempts, const U32 matchLengthSearch, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + switch (matchLengthSearch) { + case 3: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen); + default: + case 4: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen); + case 5: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen); + case 7: + case 6: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen); + } +} + +/** Tree updater, providing best match */ +static U32 ZSTD_BtGetAllMatches_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, const U32 maxNbAttempts, const U32 mls, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + if (ip < zc->base + zc->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls); + return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 1, matches, minMatchLen); +} + +static U32 ZSTD_BtGetAllMatches_selectMLS_extDict(ZSTD_CCtx *zc, /* Index table will be updated */ + const BYTE *ip, const BYTE *const iHighLimit, const U32 maxNbAttempts, const U32 matchLengthSearch, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + switch (matchLengthSearch) { + case 3: return 
ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen); + default: + case 4: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen); + case 5: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen); + case 7: + case 6: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen); + } +} + +/*-******************************* +* Optimal parser +*********************************/ +FORCE_INLINE +void ZSTD_compressBlock_opt_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const int ultra) +{ + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + const BYTE *const base = ctx->base; + const BYTE *const prefixStart = base + ctx->dictLimit; + + const U32 maxSearches = 1U << ctx->params.cParams.searchLog; + const U32 sufficient_len = ctx->params.cParams.targetLength; + const U32 mls = ctx->params.cParams.searchLength; + const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 
3 : 4; + + ZSTD_optimal_t *opt = seqStorePtr->priceTable; + ZSTD_match_t *matches = seqStorePtr->matchTable; + const BYTE *inr; + U32 offset, rep[ZSTD_REP_NUM]; + + /* init */ + ctx->nextToUpdate3 = ctx->nextToUpdate; + ZSTD_rescaleFreqs(seqStorePtr, (const BYTE *)src, srcSize); + ip += (ip == prefixStart); + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + rep[i] = ctx->rep[i]; + } + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, match_num, last_pos, litlen, price; + U32 u, mlen, best_mlen, best_off, litLength; + memset(opt, 0, sizeof(ZSTD_optimal_t)); + last_pos = 0; + litlen = (U32)(ip - anchor); + + /* check repCode */ + { + U32 i, last_i = ZSTD_REP_CHECK + (ip == anchor); + for (i = (ip == anchor); i < last_i; i++) { + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i]; + if ((repCur > 0) && (repCur < (S32)(ip - prefixStart)) && + (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repCur, minMatch))) { + mlen = (U32)ZSTD_count(ip + minMatch, ip + minMatch - repCur, iend) + minMatch; + if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + best_off = i - (ip == anchor); + do { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */ + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, ip, iend, maxSearches, mls, matches, minMatch); + + if (!last_pos && !match_num) { + ip++; + continue; + } + + if (match_num && (matches[match_num - 1].len > sufficient_len || matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + + /* set prices using matches at position = 0 */ + best_mlen = (last_pos) ? 
last_pos : minMatch; + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + while (mlen <= best_mlen) { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, matches[u].off, litlen, price); /* note : macro modifies last_pos */ + mlen++; + } + } + + if (last_pos < minMatch) { + ip++; + continue; + } + + /* initialize opt[0] */ + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + opt[0].rep[i] = rep[i]; + } + opt[0].mlen = 1; + opt[0].litlen = litlen; + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + inr = ip + cur; + + if (opt[cur - 1].mlen == 1) { + litlen = opt[cur - 1].litlen + 1; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - litlen); + } else + price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor); + } else { + litlen = 1; + price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - 1); + } + + if (cur > last_pos || price <= opt[cur].price) + SET_PRICE(cur, 1, 0, litlen, price); + + if (cur == last_pos) + break; + + if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */ + continue; + + mlen = opt[cur].mlen; + if (opt[cur].off > ZSTD_REP_MOVE_OPT) { + opt[cur].rep[2] = opt[cur - mlen].rep[1]; + opt[cur].rep[1] = opt[cur - mlen].rep[0]; + opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT; + } else { + opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur - mlen].rep[1] : opt[cur - mlen].rep[2]; + opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur - mlen].rep[0] : opt[cur - mlen].rep[1]; + opt[cur].rep[0] = + ((opt[cur].off == ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? 
(opt[cur - mlen].rep[0] - 1) : (opt[cur - mlen].rep[opt[cur].off]); + } + + best_mlen = minMatch; + { + U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1); + for (i = (opt[cur].mlen != 1); i < last_i; i++) { /* check rep */ + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (opt[cur].rep[0] - 1) : opt[cur].rep[i]; + if ((repCur > 0) && (repCur < (S32)(inr - prefixStart)) && + (ZSTD_readMINMATCH(inr, minMatch) == ZSTD_readMINMATCH(inr - repCur, minMatch))) { + mlen = (U32)ZSTD_count(inr + minMatch, inr + minMatch - repCur, iend) + minMatch; + + if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + last_pos = cur + 1; + goto _storeSequence; + } + + best_off = i - (opt[cur].mlen != 1); + if (mlen > best_mlen) + best_mlen = mlen; + + do { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr - litlen, + best_off, mlen - MINMATCH, ultra); + } else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || price <= opt[cur + mlen].price) + SET_PRICE(cur + mlen, mlen, i, litlen, price); + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, inr, iend, maxSearches, mls, matches, best_mlen); + + if (match_num > 0 && (matches[match_num - 1].len > sufficient_len || cur + matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + last_pos = cur + 1; + goto _storeSequence; + } + + /* set prices using matches at position = cur */ + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? 
matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + + while (mlen <= best_mlen) { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip + cur - litlen, + matches[u].off - 1, mlen - MINMATCH, ultra); + else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off - 1, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || (price < opt[cur + mlen].price)) + SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price); + + mlen++; + } + } + } + + best_mlen = opt[last_pos].mlen; + best_off = opt[last_pos].off; + cur = last_pos - best_mlen; + + /* store sequence */ +_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ + opt[0].mlen = 1; + + while (1) { + mlen = opt[cur].mlen; + offset = opt[cur].off; + opt[cur].mlen = best_mlen; + opt[cur].off = best_off; + best_mlen = mlen; + best_off = offset; + if (mlen > cur) + break; + cur -= mlen; + } + + for (u = 0; u <= last_pos;) { + u += opt[u].mlen; + } + + for (cur = 0; cur < last_pos;) { + mlen = opt[cur].mlen; + if (mlen == 1) { + ip++; + cur++; + continue; + } + offset = opt[cur].off; + cur += mlen; + litLength = (U32)(ip - anchor); + + if (offset > ZSTD_REP_MOVE_OPT) { + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = offset - ZSTD_REP_MOVE_OPT; + offset--; + } else { + if (offset != 0) { + best_off = (offset == ZSTD_REP_MOVE_OPT) ? 
(rep[0] - 1) : (rep[offset]); + if (offset != 1) + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = best_off; + } + if (litLength == 0) + offset--; + } + + ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + anchor = ip = ip + mlen; + } + } /* for (cur=0; cur < last_pos; ) */ + + /* Save reps for next block */ + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + ctx->repToConfirm[i] = rep[i]; + } + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +FORCE_INLINE +void ZSTD_compressBlock_opt_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const int ultra) +{ + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + const BYTE *const base = ctx->base; + const U32 lowestIndex = ctx->lowLimit; + const U32 dictLimit = ctx->dictLimit; + const BYTE *const prefixStart = base + dictLimit; + const BYTE *const dictBase = ctx->dictBase; + const BYTE *const dictEnd = dictBase + dictLimit; + + const U32 maxSearches = 1U << ctx->params.cParams.searchLog; + const U32 sufficient_len = ctx->params.cParams.targetLength; + const U32 mls = ctx->params.cParams.searchLength; + const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 
3 : 4; + + ZSTD_optimal_t *opt = seqStorePtr->priceTable; + ZSTD_match_t *matches = seqStorePtr->matchTable; + const BYTE *inr; + + /* init */ + U32 offset, rep[ZSTD_REP_NUM]; + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + rep[i] = ctx->rep[i]; + } + + ctx->nextToUpdate3 = ctx->nextToUpdate; + ZSTD_rescaleFreqs(seqStorePtr, (const BYTE *)src, srcSize); + ip += (ip == prefixStart); + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, match_num, last_pos, litlen, price; + U32 u, mlen, best_mlen, best_off, litLength; + U32 curr = (U32)(ip - base); + memset(opt, 0, sizeof(ZSTD_optimal_t)); + last_pos = 0; + opt[0].litlen = (U32)(ip - anchor); + + /* check repCode */ + { + U32 i, last_i = ZSTD_REP_CHECK + (ip == anchor); + for (i = (ip == anchor); i < last_i; i++) { + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i]; + const U32 repIndex = (U32)(curr - repCur); + const BYTE *const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if ((repCur > 0 && repCur <= (S32)curr) && + (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) { + /* repcode detected we should take it */ + const BYTE *const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + mlen = (U32)ZSTD_count_2segments(ip + minMatch, repMatch + minMatch, iend, repEnd, prefixStart) + minMatch; + + if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + + best_off = i - (ip == anchor); + litlen = opt[0].litlen; + do { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */ + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, ip, iend, maxSearches, mls, matches, minMatch); /* first search (depth 0) */ + + if (!last_pos && !match_num) { + ip++; + continue; + } + + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + opt[0].rep[i] = rep[i]; + } + opt[0].mlen = 1; + + if (match_num && (matches[match_num - 1].len > sufficient_len || matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + + best_mlen = (last_pos) ? last_pos : minMatch; + + /* set prices using matches at position = 0 */ + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? 
matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + litlen = opt[0].litlen; + while (mlen <= best_mlen) { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, matches[u].off, litlen, price); + mlen++; + } + } + + if (last_pos < minMatch) { + ip++; + continue; + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + inr = ip + cur; + + if (opt[cur - 1].mlen == 1) { + litlen = opt[cur - 1].litlen + 1; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - litlen); + } else + price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor); + } else { + litlen = 1; + price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - 1); + } + + if (cur > last_pos || price <= opt[cur].price) + SET_PRICE(cur, 1, 0, litlen, price); + + if (cur == last_pos) + break; + + if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */ + continue; + + mlen = opt[cur].mlen; + if (opt[cur].off > ZSTD_REP_MOVE_OPT) { + opt[cur].rep[2] = opt[cur - mlen].rep[1]; + opt[cur].rep[1] = opt[cur - mlen].rep[0]; + opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT; + } else { + opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur - mlen].rep[1] : opt[cur - mlen].rep[2]; + opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur - mlen].rep[0] : opt[cur - mlen].rep[1]; + opt[cur].rep[0] = + ((opt[cur].off == ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur - mlen].rep[0] - 1) : (opt[cur - mlen].rep[opt[cur].off]); + } + + best_mlen = minMatch; + { + U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1); + for (i = (mlen != 1); i < last_i; i++) { + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (opt[cur].rep[0] - 1) : opt[cur].rep[i]; + const U32 repIndex = (U32)(curr + cur - repCur); + const BYTE *const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if ((repCur > 0 && repCur <= (S32)(curr + cur)) && + (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + && (ZSTD_readMINMATCH(inr, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) { + /* repcode detected */ + const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend; + mlen = (U32)ZSTD_count_2segments(inr + minMatch, repMatch + minMatch, iend, repEnd, prefixStart) + minMatch; + + if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + last_pos = cur + 1; + goto _storeSequence; + } + + best_off = i - (opt[cur].mlen != 1); + if (mlen > best_mlen) + best_mlen = mlen; + + do { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr - litlen, + best_off, mlen - MINMATCH, ultra); + } else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || price <= opt[cur + mlen].price) + SET_PRICE(cur + mlen, mlen, i, litlen, price); + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, inr, iend, maxSearches, mls, matches, minMatch); + + if (match_num > 0 && (matches[match_num - 1].len > sufficient_len || cur + matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + last_pos = cur + 1; + goto _storeSequence; + } + + /* set prices using matches at position = cur */ + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? 
matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + + while (mlen <= best_mlen) { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip + cur - litlen, + matches[u].off - 1, mlen - MINMATCH, ultra); + else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off - 1, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || (price < opt[cur + mlen].price)) + SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price); + + mlen++; + } + } + } /* for (cur = 1; cur <= last_pos; cur++) */ + + best_mlen = opt[last_pos].mlen; + best_off = opt[last_pos].off; + cur = last_pos - best_mlen; + + /* store sequence */ +_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ + opt[0].mlen = 1; + + while (1) { + mlen = opt[cur].mlen; + offset = opt[cur].off; + opt[cur].mlen = best_mlen; + opt[cur].off = best_off; + best_mlen = mlen; + best_off = offset; + if (mlen > cur) + break; + cur -= mlen; + } + + for (u = 0; u <= last_pos;) { + u += opt[u].mlen; + } + + for (cur = 0; cur < last_pos;) { + mlen = opt[cur].mlen; + if (mlen == 1) { + ip++; + cur++; + continue; + } + offset = opt[cur].off; + cur += mlen; + litLength = (U32)(ip - anchor); + + if (offset > ZSTD_REP_MOVE_OPT) { + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = offset - ZSTD_REP_MOVE_OPT; + offset--; + } else { + if (offset != 0) { + best_off = (offset == ZSTD_REP_MOVE_OPT) ? 
(rep[0] - 1) : (rep[offset]); + if (offset != 1) + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = best_off; + } + + if (litLength == 0) + offset--; + } + + ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + anchor = ip = ip + mlen; + } + } /* for (cur=0; cur < last_pos; ) */ + + /* Save reps for next block */ + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + ctx->repToConfirm[i] = rep[i]; + } + + /* Last Literals */ + { + size_t lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +#endif /* ZSTD_OPT_H_91842398743 */ From f0b5d429750919066b9a3614083ee0432a291c5a Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Nov 2017 17:33:49 -0800 Subject: [PATCH 1171/1212] UPSTREAM: zram: add zstd to the supported algorithms list Add ZSTD to the list of supported compression algorithms. ZRAM fio perf test: LZO DEFLATE ZSTD WRITE: (2180MB/s) (77.2MB/s) (1429MB/s) WRITE: (1617MB/s) (77.7MB/s) (1202MB/s) READ: (426MB/s) (595MB/s) (1181MB/s) READ: (422MB/s) (572MB/s) (1020MB/s) READ: (318MB/s) (67.8MB/s) (563MB/s) WRITE: (318MB/s) (67.9MB/s) (564MB/s) READ: (336MB/s) (68.3MB/s) (583MB/s) WRITE: (335MB/s) (68.2MB/s) (582MB/s) WRITE: (3441MB/s) (152MB/s) (2141MB/s) WRITE: (2507MB/s) (147MB/s) (1888MB/s) READ: (801MB/s) (1146MB/s) (1890MB/s) READ: (767MB/s) (1096MB/s) (2073MB/s) READ: (621MB/s) (126MB/s) (1009MB/s) WRITE: (621MB/s) (126MB/s) (1009MB/s) READ: (656MB/s) (125MB/s) (1075MB/s) WRITE: (657MB/s) (126MB/s) (1077MB/s) WRITE: (4772MB/s) (225MB/s) (3394MB/s) WRITE: (3905MB/s) (211MB/s) (2939MB/s) READ: (1216MB/s) (1608MB/s) (3218MB/s) READ: (1159MB/s) (1431MB/s) (2981MB/s) READ: (906MB/s) (156MB/s) (1457MB/s) WRITE: (907MB/s) (156MB/s) (1458MB/s) READ: (953MB/s) (158MB/s) (1595MB/s) WRITE: (952MB/s) (157MB/s) (1593MB/s) WRITE: (6036MB/s) (265MB/s) (4469MB/s) WRITE: (5059MB/s) (263MB/s) (3951MB/s) 
READ: (1618MB/s) (2066MB/s) (4276MB/s) READ: (1573MB/s) (1942MB/s) (3830MB/s) READ: (1202MB/s) (227MB/s) (1971MB/s) WRITE: (1200MB/s) (227MB/s) (1968MB/s) READ: (1265MB/s) (226MB/s) (2116MB/s) WRITE: (1264MB/s) (226MB/s) (2114MB/s) WRITE: (5339MB/s) (233MB/s) (3781MB/s) WRITE: (4298MB/s) (234MB/s) (3276MB/s) READ: (1626MB/s) (2048MB/s) (4081MB/s) READ: (1567MB/s) (1929MB/s) (3758MB/s) READ: (1174MB/s) (205MB/s) (1747MB/s) WRITE: (1173MB/s) (204MB/s) (1746MB/s) READ: (1214MB/s) (208MB/s) (1890MB/s) WRITE: (1215MB/s) (208MB/s) (1892MB/s) WRITE: (5666MB/s) (270MB/s) (4338MB/s) WRITE: (4828MB/s) (267MB/s) (3772MB/s) READ: (1803MB/s) (2058MB/s) (4946MB/s) READ: (1805MB/s) (2156MB/s) (4711MB/s) READ: (1334MB/s) (235MB/s) (2135MB/s) WRITE: (1335MB/s) (235MB/s) (2137MB/s) READ: (1364MB/s) (236MB/s) (2268MB/s) WRITE: (1365MB/s) (237MB/s) (2270MB/s) WRITE: (5474MB/s) (270MB/s) (4300MB/s) WRITE: (4666MB/s) (266MB/s) (3817MB/s) READ: (2022MB/s) (2319MB/s) (5472MB/s) READ: (1924MB/s) (2260MB/s) (5031MB/s) READ: (1369MB/s) (242MB/s) (2153MB/s) WRITE: (1370MB/s) (242MB/s) (2155MB/s) READ: (1499MB/s) (246MB/s) (2310MB/s) WRITE: (1497MB/s) (246MB/s) (2307MB/s) WRITE: (5558MB/s) (273MB/s) (4439MB/s) WRITE: (4763MB/s) (271MB/s) (3918MB/s) READ: (2201MB/s) (2599MB/s) (6062MB/s) READ: (2105MB/s) (2463MB/s) (5413MB/s) READ: (1490MB/s) (252MB/s) (2238MB/s) WRITE: (1488MB/s) (252MB/s) (2236MB/s) READ: (1566MB/s) (254MB/s) (2434MB/s) WRITE: (1568MB/s) (254MB/s) (2437MB/s) WRITE: (5120MB/s) (264MB/s) (4035MB/s) WRITE: (4531MB/s) (267MB/s) (3740MB/s) READ: (1940MB/s) (2258MB/s) (4986MB/s) READ: (2024MB/s) (2387MB/s) (4871MB/s) READ: (1343MB/s) (246MB/s) (2038MB/s) WRITE: (1342MB/s) (246MB/s) (2037MB/s) READ: (1553MB/s) (238MB/s) (2243MB/s) WRITE: (1552MB/s) (238MB/s) (2242MB/s) WRITE: (5345MB/s) (271MB/s) (3988MB/s) WRITE: (4750MB/s) (254MB/s) (3668MB/s) READ: (1876MB/s) (2363MB/s) (5150MB/s) READ: (1990MB/s) (2256MB/s) (5080MB/s) READ: (1355MB/s) (250MB/s) (2019MB/s) WRITE: (1356MB/s) 
(251MB/s) (2020MB/s) READ: (1490MB/s) (252MB/s) (2202MB/s) WRITE: (1488MB/s) (252MB/s) (2199MB/s) jobs1 perfstat instructions 52,065,555,710 ( 0.79) 855,731,114,587 ( 2.64) 54,280,709,944 ( 1.40) branches 14,020,427,116 ( 725.847) 101,733,449,582 (1074.521) 11,170,591,067 ( 992.869) branch-misses 22,626,174 ( 0.16%) 274,197,885 ( 0.27%) 25,915,805 ( 0.23%) jobs2 perfstat instructions 103,633,110,402 ( 0.75) 1,710,822,100,914 ( 2.59) 107,879,874,104 ( 1.28) branches 27,931,237,282 ( 679.203) 203,298,267,479 (1037.326) 22,185,350,842 ( 884.427) branch-misses 46,103,811 ( 0.17%) 533,747,204 ( 0.26%) 49,682,483 ( 0.22%) jobs3 perfstat instructions 154,857,283,657 ( 0.76) 2,565,748,974,197 ( 2.57) 161,515,435,813 ( 1.31) branches 41,759,490,355 ( 670.529) 304,905,605,277 ( 978.765) 33,215,805,907 ( 888.003) branch-misses 74,263,293 ( 0.18%) 759,746,240 ( 0.25%) 76,841,196 ( 0.23%) jobs4 perfstat instructions 206,215,849,076 ( 0.75) 3,420,169,460,897 ( 2.60) 215,003,061,664 ( 1.31) branches 55,632,141,739 ( 666.501) 406,394,977,433 ( 927.241) 44,214,322,251 ( 883.532) branch-misses 102,287,788 ( 0.18%) 1,098,617,314 ( 0.27%) 103,891,040 ( 0.23%) jobs5 perfstat instructions 258,711,315,588 ( 0.67) 4,275,657,533,244 ( 2.23) 269,332,235,685 ( 1.08) branches 69,802,821,166 ( 588.823) 507,996,211,252 ( 797.036) 55,450,846,129 ( 735.095) branch-misses 129,217,214 ( 0.19%) 1,243,284,991 ( 0.24%) 173,512,278 ( 0.31%) jobs6 perfstat instructions 312,796,166,008 ( 0.61) 5,133,896,344,660 ( 2.02) 323,658,769,588 ( 1.04) branches 84,372,488,583 ( 520.541) 610,310,494,402 ( 697.642) 66,683,292,992 ( 693.939) branch-misses 159,438,978 ( 0.19%) 1,396,368,563 ( 0.23%) 174,406,934 ( 0.26%) jobs7 perfstat instructions 363,211,372,930 ( 0.56) 5,988,205,600,879 ( 1.75) 377,824,674,156 ( 0.93) branches 98,057,013,765 ( 463.117) 711,841,255,974 ( 598.762) 77,879,009,954 ( 600.443) branch-misses 199,513,153 ( 0.20%) 1,507,651,077 ( 0.21%) 248,203,369 ( 0.32%) jobs8 perfstat instructions 
413,960,354,615 ( 0.52) 6,842,918,558,378 ( 1.45) 431,938,486,581 ( 0.83) branches 111,812,574,884 ( 414.224) 813,299,084,518 ( 491.173) 89,062,699,827 ( 517.795) branch-misses 233,584,845 ( 0.21%) 1,531,593,921 ( 0.19%) 286,818,489 ( 0.32%) jobs9 perfstat instructions 465,976,220,300 ( 0.53) 7,698,467,237,372 ( 1.47) 486,352,600,321 ( 0.84) branches 125,931,456,162 ( 424.063) 915,207,005,715 ( 498.192) 100,370,404,090 ( 517.439) branch-misses 256,992,445 ( 0.20%) 1,782,809,816 ( 0.19%) 345,239,380 ( 0.34%) jobs10 perfstat instructions 517,406,372,715 ( 0.53) 8,553,527,312,900 ( 1.48) 540,732,653,094 ( 0.84) branches 139,839,780,676 ( 427.732) 1,016,737,699,389 ( 503.172) 111,696,557,638 ( 516.750) branch-misses 259,595,561 ( 0.19%) 1,952,570,279 ( 0.19%) 357,818,661 ( 0.32%) seconds elapsed 20.630411534 96.084546565 12.743373571 seconds elapsed 22.292627625 100.984155001 14.407413560 seconds elapsed 22.396016966 110.344880848 14.032201392 seconds elapsed 22.517330949 113.351459170 14.243074935 seconds elapsed 28.548305104 156.515193765 19.159286861 seconds elapsed 30.453538116 164.559937678 19.362492717 seconds elapsed 33.467108086 188.486827481 21.492612173 seconds elapsed 35.617727591 209.602677783 23.256422492 seconds elapsed 42.584239509 243.959902566 28.458540338 seconds elapsed 47.683632526 269.635248851 31.542404137 Over all, ZSTD has slower WRITE, but much faster READ (perhaps a static compression buffer used during the test helped ZSTD a lot), which results in faster test results. 
Memory consumption (zram mm_stat file): zram LZO mm_stat mm_stat (jobs1): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs2): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs3): 2147483648 23068672 33558528 0 33562624 0 0 mm_stat (jobs4): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs5): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs6): 2147483648 23068672 33558528 0 33562624 0 0 mm_stat (jobs7): 2147483648 23068672 33558528 0 33566720 0 0 mm_stat (jobs8): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs9): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs10): 2147483648 23068672 33558528 0 33562624 0 0 zram DEFLATE mm_stat mm_stat (jobs1): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs2): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs3): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs4): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs5): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs6): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs7): 2147483648 16252928 25178112 0 25190400 0 0 mm_stat (jobs8): 2147483648 16252928 25178112 0 25190400 0 0 mm_stat (jobs9): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs10): 2147483648 16252928 25178112 0 25178112 0 0 zram ZSTD mm_stat mm_stat (jobs1): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs2): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs3): 2147483648 11010048 16781312 0 16785408 0 0 mm_stat (jobs4): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs5): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs6): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs7): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs8): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs9): 2147483648 11010048 16781312 0 16785408 0 0 mm_stat (jobs10): 2147483648 11010048 16781312 0 16781312 0 0 
================================================================================== Official benchmarks [1]: Compressor name Ratio Compression Decompress. zstd 1.1.3 -1 2.877 430 MB/s 1110 MB/s zlib 1.2.8 -1 2.743 110 MB/s 400 MB/s brotli 0.5.2 -0 2.708 400 MB/s 430 MB/s quicklz 1.5.0 -1 2.238 550 MB/s 710 MB/s lzo1x 2.09 -1 2.108 650 MB/s 830 MB/s lz4 1.7.5 2.101 720 MB/s 3600 MB/s snappy 1.1.3 2.091 500 MB/s 1650 MB/s lzf 3.6 -1 2.077 400 MB/s 860 MB/s Minchan said: : I did test with my sample data and compared zstd with deflate. zstd's : compress ratio is lower a little bit but compression speed is much faster : 3 times more and decompress speed is too 2 times more. With different : data, it is different but overall, zstd would be better for speed at the : cost of a little lower compress ratio(about 5%) so I believe it's worth to : replace deflate. [1] https://github.com/facebook/zstd Link: http://lkml.kernel.org/r/20170912050005.3247-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Tested-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 5ef3a8b12556d7fcba81edc74e9d85b029615ae0) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ieb6239dab92f560fa654d9cc29b1e266f2e44050 Signed-off-by: Amit Pundir --- drivers/block/zram/zcomp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 4b5cd3a7b2b6..c084a7f9763d 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -31,6 +31,9 @@ static const char * const backends[] = { #endif #if IS_ENABLED(CONFIG_CRYPTO_842) "842", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_ZSTD) + "zstd", #endif NULL }; From e9c01c2e6041b9efbcef2aa05f870559edcc3d2c Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 30 Mar 2018 12:14:53 -0700 Subject: [PATCH 1172/1212] BACKPORT: crypto: zstd - Add zstd support Adds zstd support to crypto and scompress. 
Only supports the default level. Previously we held off on this patch, since there weren't any users. Now zram is ready for zstd support, but depends on CONFIG_CRYPTO_ZSTD, which isn't defined until this patch is in. I also see a patch adding zstd to pstore [0], which depends on crypto zstd. [0] lkml.kernel.org/r/9c9416b2dff19f05fb4c35879aaa83d11ff72c92.1521626182.git.geliangtang@gmail.com Signed-off-by: Nick Terrell Signed-off-by: Herbert Xu (cherry picked from commit d28fc3dbe1918333730d62aa5f0d84b6fb4e7254) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I070acf1dd8bf415f8997a48ee908d930754fc71e Signed-off-by: Amit Pundir --- crypto/Kconfig | 9 ++ crypto/Makefile | 1 + crypto/testmgr.c | 16 ++++ crypto/testmgr.h | 74 +++++++++++++++++ crypto/zstd.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 309 insertions(+) create mode 100644 crypto/zstd.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 7a9f4d3d089b..ba8ca7cedb7e 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1577,6 +1577,15 @@ config CRYPTO_LZ4HC help This is the LZ4 high compression mode algorithm. +config CRYPTO_ZSTD + tristate "Zstd compression algorithm" + select CRYPTO_ALGAPI + select CRYPTO_ACOMP2 + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + help + This is the zstd algorithm. 
+ comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index 7ae15c47f684..74f36e7d163f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -126,6 +126,7 @@ obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o +obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o # # generic algorithms and the async_tx api diff --git a/crypto/testmgr.c b/crypto/testmgr.c index a4aef61e40d8..2329b5f16b8c 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3949,6 +3949,22 @@ static const struct alg_test_desc alg_test_descs[] = { } } } + }, { + .alg = "zstd", + .test = alg_test_comp, + .fips_allowed = 1, + .suite = { + .comp = { + .comp = { + .vecs = zstd_comp_tv_template, + .count = ZSTD_COMP_TEST_VECTORS + }, + .decomp = { + .vecs = zstd_decomp_tv_template, + .count = ZSTD_DECOMP_TEST_VECTORS + } + } + } } }; diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 0bb950ea22ed..58072e1a4def 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -35331,4 +35331,78 @@ static struct comp_testvec lz4hc_decomp_tv_template[] = { }, }; +#define ZSTD_COMP_TEST_VECTORS 2 +#define ZSTD_DECOMP_TEST_VECTORS 2 + +static struct comp_testvec zstd_comp_tv_template[] = { + { + .inlen = 68, + .outlen = 39, + .input = "The algorithm is zstd. " + "The algorithm is zstd. " + "The algorithm is zstd.", + .output = "\x28\xb5\x2f\xfd\x00\x50\xf5\x00\x00\xb8\x54\x68\x65" + "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73" + "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01" + , + }, + { + .inlen = 244, + .outlen = 151, + .input = "zstd, short for Zstandard, is a fast lossless " + "compression algorithm, targeting real-time " + "compression scenarios at zlib-level and better " + "compression ratios. 
The zstd compression library " + "provides in-memory compression and decompression " + "functions.", + .output = "\x28\xb5\x2f\xfd\x00\x50\x75\x04\x00\x42\x4b\x1e\x17" + "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32" + "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f" + "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad" + "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60" + "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86" + "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90" + "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64" + "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30" + "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc" + "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e" + "\x20\xa9\x0e\x82\xb9\x43\x45\x01", + }, +}; + +static struct comp_testvec zstd_decomp_tv_template[] = { + { + .inlen = 43, + .outlen = 68, + .input = "\x28\xb5\x2f\xfd\x04\x50\xf5\x00\x00\xb8\x54\x68\x65" + "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73" + "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01" + "\x6b\xf4\x13\x35", + .output = "The algorithm is zstd. " + "The algorithm is zstd. 
" + "The algorithm is zstd.", + }, + { + .inlen = 155, + .outlen = 244, + .input = "\x28\xb5\x2f\xfd\x04\x50\x75\x04\x00\x42\x4b\x1e\x17" + "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32" + "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f" + "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad" + "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60" + "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86" + "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90" + "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64" + "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30" + "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc" + "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e" + "\x20\xa9\x0e\x82\xb9\x43\x45\x01\xaa\x6d\xda\x0d", + .output = "zstd, short for Zstandard, is a fast lossless " + "compression algorithm, targeting real-time " + "compression scenarios at zlib-level and better " + "compression ratios. The zstd compression library " + "provides in-memory compression and decompression " + "functions.", + }, +}; #endif /* _CRYPTO_TESTMGR_H */ diff --git a/crypto/zstd.c b/crypto/zstd.c new file mode 100644 index 000000000000..9bfd28f8cc77 --- /dev/null +++ b/crypto/zstd.c @@ -0,0 +1,209 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2017-present, Facebook, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + + +#define ZSTD_DEF_LEVEL 3 + +struct zstd_ctx { + ZSTD_CCtx *cctx; + ZSTD_DCtx *dctx; + void *cwksp; + void *dwksp; +}; + +static ZSTD_parameters zstd_params(void) +{ + return ZSTD_getParams(ZSTD_DEF_LEVEL, 0, 0); +} + +static int zstd_comp_init(struct zstd_ctx *ctx) +{ + int ret = 0; + const ZSTD_parameters params = zstd_params(); + const size_t wksp_size = ZSTD_CCtxWorkspaceBound(params.cParams); + + ctx->cwksp = vzalloc(wksp_size); + if (!ctx->cwksp) { + ret = -ENOMEM; + goto out; + } + + ctx->cctx = ZSTD_initCCtx(ctx->cwksp, wksp_size); + if (!ctx->cctx) { + ret = -EINVAL; + goto out_free; + } +out: + return ret; +out_free: + vfree(ctx->cwksp); + goto out; +} + +static int zstd_decomp_init(struct zstd_ctx *ctx) +{ + int ret = 0; + const size_t wksp_size = ZSTD_DCtxWorkspaceBound(); + + ctx->dwksp = vzalloc(wksp_size); + if (!ctx->dwksp) { + ret = -ENOMEM; + goto out; + } + + ctx->dctx = ZSTD_initDCtx(ctx->dwksp, wksp_size); + if (!ctx->dctx) { + ret = -EINVAL; + goto out_free; + } +out: + return ret; +out_free: + vfree(ctx->dwksp); + goto out; +} + +static void zstd_comp_exit(struct zstd_ctx *ctx) +{ + vfree(ctx->cwksp); + ctx->cwksp = NULL; + ctx->cctx = NULL; +} + +static void zstd_decomp_exit(struct zstd_ctx *ctx) +{ + vfree(ctx->dwksp); + ctx->dwksp = NULL; + ctx->dctx = NULL; +} + +static int __zstd_init(void *ctx) +{ + int ret; + + ret = zstd_comp_init(ctx); + if (ret) + return ret; + ret = zstd_decomp_init(ctx); + if (ret) + zstd_comp_exit(ctx); + return ret; +} + +static int zstd_init(struct crypto_tfm *tfm) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + return __zstd_init(ctx); +} + +static void __zstd_exit(void *ctx) +{ + zstd_comp_exit(ctx); + zstd_decomp_exit(ctx); +} + +static void zstd_exit(struct crypto_tfm *tfm) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + __zstd_exit(ctx); +} + +static int __zstd_compress(const u8 *src, unsigned int slen, + u8 
*dst, unsigned int *dlen, void *ctx) +{ + size_t out_len; + struct zstd_ctx *zctx = ctx; + const ZSTD_parameters params = zstd_params(); + + out_len = ZSTD_compressCCtx(zctx->cctx, dst, *dlen, src, slen, params); + if (ZSTD_isError(out_len)) + return -EINVAL; + *dlen = out_len; + return 0; +} + +static int zstd_compress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + return __zstd_compress(src, slen, dst, dlen, ctx); +} + +static int __zstd_decompress(const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen, void *ctx) +{ + size_t out_len; + struct zstd_ctx *zctx = ctx; + + out_len = ZSTD_decompressDCtx(zctx->dctx, dst, *dlen, src, slen); + if (ZSTD_isError(out_len)) + return -EINVAL; + *dlen = out_len; + return 0; +} + +static int zstd_decompress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct zstd_ctx *ctx = crypto_tfm_ctx(tfm); + + return __zstd_decompress(src, slen, dst, dlen, ctx); +} + +static struct crypto_alg alg = { + .cra_name = "zstd", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct zstd_ctx), + .cra_module = THIS_MODULE, + .cra_init = zstd_init, + .cra_exit = zstd_exit, + .cra_u = { .compress = { + .coa_compress = zstd_compress, + .coa_decompress = zstd_decompress } } +}; + +static int __init zstd_mod_init(void) +{ + int ret; + + ret = crypto_register_alg(&alg); + if (ret) + return ret; + + return ret; +} + +static void __exit zstd_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(zstd_mod_init); +module_exit(zstd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Zstd Compression Algorithm"); +MODULE_ALIAS_CRYPTO("zstd"); From e4d5d60b23c4fec2f9e3390b2553692d99e9ece9 Mon Sep 17 00:00:00 2001 From: Peter Kalauskas Date: Thu, 23 Aug 2018 16:25:04 -0700 Subject: [PATCH 1173/1212] ANDROID: x86_64_cuttlefish_defconfig: Enable zram and zstd Signed-off-by: 
Peter Kalauskas Bug: 112488418 Change-Id: Id53d213c1630f59cb7934309f0da4e9dae6545d8 Signed-off-by: Amit Pundir --- arch/x86/configs/x86_64_cuttlefish_defconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 99d7b53932f7..e42f748f7414 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -48,6 +48,7 @@ CONFIG_X86_CPUID=y CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_ZSMALLOC=y # CONFIG_MTRR is not set CONFIG_HZ_100=y CONFIG_KEXEC=y @@ -199,6 +200,7 @@ CONFIG_DEBUG_DEVRES=y CONFIG_OF=y CONFIG_OF_UNITTEST=y # CONFIG_PNP_DEBUG_MESSAGES is not set +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 @@ -450,6 +452,7 @@ CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_ZSTD=y CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y CONFIG_X509_CERTIFICATE_PARSER=y From 50436998d07f00470fae27d88017139c59a229a8 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:19:44 -0700 Subject: [PATCH 1174/1212] UPSTREAM: zram: clean up duplicated codes in __zram_bvec_write Patch series "writeback incompressible pages to storage", v1. zRam is useful for memory saving with compressible pages but sometime, workload can be changed and system has lots of incompressible pages which is very harmful for zram. This patch supports writeback feature of zram so admin can set up a block device and with it, zram can save the memory via writing out the incompressile pages once it found it's incompressible pages (1/4 comp ratio) instead of keeping the page in memory. [1-3] is just clean up and [4-8] is step by step feature enablement. 
[4-8] is logically not bisectable(ie, logical unit separation) although I tried to compiled out without breaking but I think it would be better to review. This patch (of 9): __zram_bvec_write has some of duplicated logic for zram meta data handling of same_page|compressed_page. This patch aims to clean it up without behavior change. [xieyisheng1@huawei.com: fix compr_data_size stat] Link: http://lkml.kernel.org/r/1502707447-6944-1-git-send-email-xieyisheng1@huawei.com Link: http://lkml.kernel.org/r/1496019048-27016-1-git-send-email-minchan@kernel.org Link: http://lkml.kernel.org/r/1498459987-24562-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Yisheng Xie Reviewed-by: Sergey Senozhatsky Cc: Juneho Choi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 4ebbe7f7fc99260afd51759e35dbfdd6010dc697) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I3fa150c869a66ff289712b956924ecb361864a2e Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 56 ++++++++++++++--------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7bed8e73c376..2c3046b1f9b7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -452,30 +452,6 @@ static bool zram_same_page_read(struct zram *zram, u32 index, return false; } -static bool zram_same_page_write(struct zram *zram, u32 index, - struct page *page) -{ - unsigned long element; - void *mem = kmap_atomic(page); - - if (page_same_filled(mem, &element)) { - kunmap_atomic(mem); - /* Free memory associated with this sector now. 
*/ - zram_slot_lock(zram, index); - zram_free_page(zram, index); - zram_set_flag(zram, index, ZRAM_SAME); - zram_set_element(zram, index, element); - zram_slot_unlock(zram, index); - - atomic64_inc(&zram->stats.same_pages); - atomic64_inc(&zram->stats.pages_stored); - return true; - } - kunmap_atomic(mem); - - return false; -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -684,14 +660,23 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) { int ret; - unsigned long handle; - unsigned int comp_len; - void *src, *dst; + unsigned long handle = 0; + unsigned int comp_len = 0; + void *src, *dst, *mem; struct zcomp_strm *zstrm; struct page *page = bvec->bv_page; + unsigned long element = 0; + enum zram_pageflags flags = 0; - if (zram_same_page_write(zram, index, page)) - return 0; + mem = kmap_atomic(page); + if (page_same_filled(mem, &element)) { + kunmap_atomic(mem); + /* Free memory associated with this sector now */ + atomic64_inc(&zram->stats.same_pages); + flags = ZRAM_SAME; + goto out; + } + kunmap_atomic(mem); zstrm = zcomp_stream_get(zram->comp); ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); @@ -711,19 +696,24 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) zcomp_stream_put(zram->comp); zs_unmap_object(zram->mem_pool, handle); - + atomic64_add(comp_len, &zram->stats.compr_data_size); +out: /* * Free memory associated with this sector * before overwriting unused sectors. 
*/ zram_slot_lock(zram, index); zram_free_page(zram, index); - zram_set_handle(zram, index, handle); - zram_set_obj_size(zram, index, comp_len); + if (flags == ZRAM_SAME) { + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_element(zram, index, element); + } else { + zram_set_handle(zram, index, handle); + zram_set_obj_size(zram, index, comp_len); + } zram_slot_unlock(zram, index); /* Update stats */ - atomic64_add(comp_len, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); return 0; } From eda1379b4d3cc9736370dcc8f8338f0af6853d75 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:19:47 -0700 Subject: [PATCH 1175/1212] UPSTREAM: zram: inline zram_compress zram_compress does several things, compress, entry alloc and check limitation. I did for just readbility but it hurts modulization.:( So this patch removes zram_compress functions and inline it in __zram_bvec_write for upcoming patches. Link: http://lkml.kernel.org/r/1498459987-24562-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 97ec7c8bd5d029b2c3e40355c1204197094e9ba1) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ibb37d77168edd0b01d0b9820e431c73cc3c2ff20 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 64 ++++++++++++----------------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 2c3046b1f9b7..96c175bb476f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -588,25 +588,38 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return ret; } -static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, - struct page *page, - unsigned long *out_handle, unsigned int *out_comp_len) +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) { int 
ret; - unsigned int comp_len; - void *src; unsigned long alloced_pages; unsigned long handle = 0; + unsigned int comp_len = 0; + void *src, *dst, *mem; + struct zcomp_strm *zstrm; + struct page *page = bvec->bv_page; + unsigned long element = 0; + enum zram_pageflags flags = 0; + + mem = kmap_atomic(page); + if (page_same_filled(mem, &element)) { + kunmap_atomic(mem); + /* Free memory associated with this sector now. */ + flags = ZRAM_SAME; + atomic64_inc(&zram->stats.same_pages); + goto out; + } + kunmap_atomic(mem); compress_again: + zstrm = zcomp_stream_get(zram->comp); src = kmap_atomic(page); - ret = zcomp_compress(*zstrm, src, &comp_len); + ret = zcomp_compress(zstrm, src, &comp_len); kunmap_atomic(src); if (unlikely(ret)) { + zcomp_stream_put(zram->comp); pr_err("Compression failed! err=%d\n", ret); - if (handle) - zs_free(zram->mem_pool, handle); + zs_free(zram->mem_pool, handle); return ret; } @@ -638,7 +651,6 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); - *zstrm = zcomp_stream_get(zram->comp); if (handle) goto compress_again; return -ENOMEM; @@ -648,43 +660,11 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { + zcomp_stream_put(zram->comp); zs_free(zram->mem_pool, handle); return -ENOMEM; } - *out_handle = handle; - *out_comp_len = comp_len; - return 0; -} - -static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) -{ - int ret; - unsigned long handle = 0; - unsigned int comp_len = 0; - void *src, *dst, *mem; - struct zcomp_strm *zstrm; - struct page *page = bvec->bv_page; - unsigned long element = 0; - enum zram_pageflags flags = 0; - - mem = kmap_atomic(page); - if (page_same_filled(mem, &element)) { - kunmap_atomic(mem); - /* Free memory associated with this sector now */ - 
atomic64_inc(&zram->stats.same_pages); - flags = ZRAM_SAME; - goto out; - } - kunmap_atomic(mem); - - zstrm = zcomp_stream_get(zram->comp); - ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); - if (ret) { - zcomp_stream_put(zram->comp); - return ret; - } - dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); src = zstrm->buffer; From 1e9f94054587ba4c24d7c5996be24935e957d28e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:19:50 -0700 Subject: [PATCH 1176/1212] UPSTREAM: zram: rename zram_decompress_page to __zram_bvec_read zram_decompress_page naming is not proper because it doesn't decompress if page was dedup hit or stored with compression. Use more abstract term and consistent with write path function __zram_bvec_write. Link: http://lkml.kernel.org/r/1498459987-24562-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 693dc1ce25b8c8fa33f930d47cd8f926eeb90812) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ia7c948f4b78601458b7ebc23ab345d4bc0a8d4a8 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 96c175bb476f..9137eff24e6d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -517,7 +517,7 @@ static void zram_free_page(struct zram *zram, size_t index) zram_set_obj_size(zram, index, 0); } -static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) +static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index) { int ret; unsigned long handle; @@ -569,7 +569,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return -ENOMEM; } - ret = zram_decompress_page(zram, page, index); + ret = __zram_bvec_read(zram, page, index); if (unlikely(ret)) goto out; @@ -717,7 
+717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, if (!page) return -ENOMEM; - ret = zram_decompress_page(zram, page, index); + ret = __zram_bvec_read(zram, page, index); if (ret) goto out; From 3973f693934d66ab10500eb81b5b27de65f68068 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:19:54 -0700 Subject: [PATCH 1177/1212] UPSTREAM: zram: add interface to specif backing device For writeback feature, user should set up backing device before the zram working. This patch enables the interface via /sys/block/zramX/backing_dev. Currently, it supports block device only but it could be enhanced for file as well. Link: http://lkml.kernel.org/r/1498459987-24562-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 013bf95a83ec760a2afc37fabd6bf13a9cdae205) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I4bbf12ed7496d476bddd574e756bac5c8a838089 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 142 ++++++++++++++++++++++++++++++++++ drivers/block/zram/zram_drv.h | 5 ++ 2 files changed, 147 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9137eff24e6d..9732218d94a9 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -269,6 +269,141 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +#ifdef CONFIG_ZRAM_WRITEBACK +static bool zram_wb_enabled(struct zram *zram) +{ + return zram->backing_dev; +} + +static void reset_bdev(struct zram *zram) +{ + struct block_device *bdev; + + if (!zram_wb_enabled(zram)) + return; + + bdev = zram->bdev; + if (zram->old_block_size) + set_blocksize(bdev, zram->old_block_size); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + /* hope filp_close flush all of IO */ + filp_close(zram->backing_dev, NULL); + zram->backing_dev = NULL; + zram->old_block_size 
= 0; + zram->bdev = NULL; +} + +static ssize_t backing_dev_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + struct file *file = zram->backing_dev; + char *p; + ssize_t ret; + + down_read(&zram->init_lock); + if (!zram_wb_enabled(zram)) { + memcpy(buf, "none\n", 5); + up_read(&zram->init_lock); + return 5; + } + + p = file_path(file, buf, PAGE_SIZE - 1); + if (IS_ERR(p)) { + ret = PTR_ERR(p); + goto out; + } + + ret = strlen(p); + memmove(buf, p, ret); + buf[ret++] = '\n'; +out: + up_read(&zram->init_lock); + return ret; +} + +static ssize_t backing_dev_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + char *file_name; + struct file *backing_dev = NULL; + struct inode *inode; + struct address_space *mapping; + unsigned int old_block_size = 0; + struct block_device *bdev = NULL; + int err; + struct zram *zram = dev_to_zram(dev); + + file_name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!file_name) + return -ENOMEM; + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Can't setup backing device for initialized device\n"); + err = -EBUSY; + goto out; + } + + strlcpy(file_name, buf, len); + + backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(backing_dev)) { + err = PTR_ERR(backing_dev); + backing_dev = NULL; + goto out; + } + + mapping = backing_dev->f_mapping; + inode = mapping->host; + + /* Support only block device in this moment */ + if (!S_ISBLK(inode->i_mode)) { + err = -ENOTBLK; + goto out; + } + + bdev = bdgrab(I_BDEV(inode)); + err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram); + if (err < 0) + goto out; + + old_block_size = block_size(bdev); + err = set_blocksize(bdev, PAGE_SIZE); + if (err) + goto out; + + reset_bdev(zram); + + zram->old_block_size = old_block_size; + zram->bdev = bdev; + zram->backing_dev = backing_dev; + up_write(&zram->init_lock); + + pr_info("setup backing device %s\n", file_name); 
+ kfree(file_name); + + return len; +out: + if (bdev) + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + + if (backing_dev) + filp_close(backing_dev, NULL); + + up_write(&zram->init_lock); + + kfree(file_name); + + return err; +} + +#else +static bool zram_wb_enabled(struct zram *zram) { return false; } +static inline void reset_bdev(struct zram *zram) {}; +#endif + + /* * We switched to per-cpu streams and this attr is not needed anymore. * However, we will keep it around for some time, because: @@ -947,6 +1082,7 @@ static void zram_reset_device(struct zram *zram) zram_meta_free(zram, disksize); memset(&zram->stats, 0, sizeof(zram->stats)); zcomp_destroy(comp); + reset_bdev(zram); } static ssize_t disksize_store(struct device *dev, @@ -1072,6 +1208,9 @@ static DEVICE_ATTR_WO(mem_limit); static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); +#ifdef CONFIG_ZRAM_WRITEBACK +static DEVICE_ATTR_RW(backing_dev); +#endif static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1082,6 +1221,9 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, +#ifdef CONFIG_ZRAM_WRITEBACK + &dev_attr_backing_dev.attr, +#endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, &dev_attr_debug_stat.attr, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e34e44d02e3e..113a41118918 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -115,5 +115,10 @@ struct zram { * zram is claimed so open request will be failed */ bool claim; /* Protected by bdev->bd_mutex */ +#ifdef CONFIG_ZRAM_WRITEBACK + struct file *backing_dev; + struct block_device *bdev; + unsigned int old_block_size; +#endif }; #endif From e75f62136262988878845e453c9deed407cad919 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:19:57 -0700 Subject: [PATCH 1178/1212] BACKPORT: 
zram: add free space management in backing device With backing device, zram needs management of free space of backing device. This patch adds bitmap logic to manage free space which is very naive. However, it would be simple enough as considering uncompressible pages' frequency in zram. Link: http://lkml.kernel.org/r/1498459987-24562-6-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 1363d4662a0d28dfdb81ef426c88c9a8dbf7c338) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I37dc98b40bfddceb9eb6d989ca30683dbf89210c Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 56 ++++++++++++++++++++++++++++++++++- drivers/block/zram/zram_drv.h | 3 ++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9732218d94a9..79b5e9c774a7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -291,6 +291,9 @@ static void reset_bdev(struct zram *zram) zram->backing_dev = NULL; zram->old_block_size = 0; zram->bdev = NULL; + + kvfree(zram->bitmap); + zram->bitmap = NULL; } static ssize_t backing_dev_show(struct device *dev, @@ -329,10 +332,12 @@ static ssize_t backing_dev_store(struct device *dev, struct file *backing_dev = NULL; struct inode *inode; struct address_space *mapping; - unsigned int old_block_size = 0; + unsigned int bitmap_sz, old_block_size = 0; + unsigned long nr_pages, *bitmap = NULL; struct block_device *bdev = NULL; int err; struct zram *zram = dev_to_zram(dev); + gfp_t kmalloc_flags; file_name = kmalloc(PATH_MAX, GFP_KERNEL); if (!file_name) @@ -368,16 +373,34 @@ static ssize_t backing_dev_store(struct device *dev, if (err < 0) goto out; + nr_pages = i_size_read(inode) >> PAGE_SHIFT; + bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); + kmalloc_flags = GFP_KERNEL | __GFP_ZERO; + if (bitmap_sz > PAGE_SIZE) +
kmalloc_flags |= __GFP_NOWARN | __GFP_NORETRY; + + bitmap = kmalloc_node(bitmap_sz, kmalloc_flags, NUMA_NO_NODE); + if (!bitmap && bitmap_sz > PAGE_SIZE) + bitmap = vzalloc(bitmap_sz); + + if (!bitmap) { + err = -ENOMEM; + goto out; + } + old_block_size = block_size(bdev); err = set_blocksize(bdev, PAGE_SIZE); if (err) goto out; reset_bdev(zram); + spin_lock_init(&zram->bitmap_lock); zram->old_block_size = old_block_size; zram->bdev = bdev; zram->backing_dev = backing_dev; + zram->bitmap = bitmap; + zram->nr_pages = nr_pages; up_write(&zram->init_lock); pr_info("setup backing device %s\n", file_name); @@ -385,6 +408,9 @@ static ssize_t backing_dev_store(struct device *dev, return len; out: + if (bitmap) + kvfree(bitmap); + if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); @@ -398,6 +424,34 @@ static ssize_t backing_dev_store(struct device *dev, return err; } +static unsigned long get_entry_bdev(struct zram *zram) +{ + unsigned long entry; + + spin_lock(&zram->bitmap_lock); + /* skip 0 bit to confuse zram.handle = 0 */ + entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1); + if (entry == zram->nr_pages) { + spin_unlock(&zram->bitmap_lock); + return 0; + } + + set_bit(entry, zram->bitmap); + spin_unlock(&zram->bitmap_lock); + + return entry; +} + +static void put_entry_bdev(struct zram *zram, unsigned long entry) +{ + int was_set; + + spin_lock(&zram->bitmap_lock); + was_set = test_and_clear_bit(entry, zram->bitmap); + spin_unlock(&zram->bitmap_lock); + WARN_ON_ONCE(!was_set); +} + #else static bool zram_wb_enabled(struct zram *zram) { return false; } static inline void reset_bdev(struct zram *zram) {}; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 113a41118918..707aec0a2681 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -119,6 +119,9 @@ struct zram { struct file *backing_dev; struct block_device *bdev; unsigned int old_block_size; + unsigned long *bitmap; + unsigned long 
nr_pages; + spinlock_t bitmap_lock; #endif }; #endif From ca9fe02221ae711d801d6c95b35cb5a757bce824 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:20:00 -0700 Subject: [PATCH 1179/1212] BACKPORT: zram: identify asynchronous IO's return value For upcoming asynchronous IO like writeback, zram_rw_page should be aware of that whether requested IO was completed or submitted successfully, otherwise error. For the goal, zram_bvec_rw has three return values. -errno: returns error number 0: IO request is done synchronously 1: IO request is issued successfully. Link: http://lkml.kernel.org/r/1498459987-24562-7-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit ae85a8075c5b025b9d503554ddc480a346a24536) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Id6e764b3eacfebdca2f46050648a49fc5f276b2c Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 79b5e9c774a7..df0d61dfa110 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -779,7 +779,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) { - int ret; + int ret = 0; unsigned long alloced_pages; unsigned long handle = 0; unsigned int comp_len = 0; @@ -884,7 +884,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) /* Update stats */ atomic64_inc(&zram->stats.pages_stored); - return 0; + return ret; } static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, @@ -966,6 +966,11 @@ static void zram_bio_discard(struct zram *zram, u32 index, } } +/* + * Returns errno if it has some problem. Otherwise return 0 or 1. 
+ * Returns 0 if IO request was done synchronously + * Returns 1 if IO request was successfully submitted. + */ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, int rw) { @@ -986,7 +991,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, generic_end_io_acct(rw, &zram->disk->part0, start_time); - if (unlikely(ret)) { + if (unlikely(ret < 0)) { if (rw == READ) atomic64_inc(&zram->stats.failed_reads); else @@ -1075,7 +1080,7 @@ static void zram_slot_free_notify(struct block_device *bdev, static int zram_rw_page(struct block_device *bdev, sector_t sector, struct page *page, int rw) { - int offset, err = -EIO; + int offset, ret; u32 index; struct zram *zram; struct bio_vec bv; @@ -1084,7 +1089,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, if (!valid_io_request(zram, sector, PAGE_SIZE)) { atomic64_inc(&zram->stats.invalid_io); - err = -EINVAL; + ret = -EINVAL; goto out; } @@ -1095,7 +1100,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; - err = zram_bvec_rw(zram, &bv, index, offset, rw); + ret = zram_bvec_rw(zram, &bv, index, offset, rw); out: /* * If I/O fails, just return error(ie, non-zero) without @@ -1105,9 +1110,20 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, * bio->bi_end_io does things to handle the error * (e.g., SetPageError, set_page_dirty and extra works). 
*/ - if (err == 0) + if (unlikely(ret < 0)) + return ret; + + switch (ret) { + case 0: page_endio(page, rw, 0); - return err; + break; + case 1: + ret = 0; + break; + default: + WARN_ON(1); + } + return ret; } static void zram_reset_device(struct zram *zram) From 46ec4c35375e00277a96e1b167e16ef463468cbf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:20:03 -0700 Subject: [PATCH 1180/1212] BACKPORT: zram: write incompressible pages to backing device This patch enables write IO to transfer data to backing device. For that, it implements write_to_bdev function which creates new bio and chaining with parent bio to make the parent bio asynchronous. For rw_page which don't have parent bio, it submit owned bio and handle IO completion by zram_page_end_io. Also, this patch defines new flag ZRAM_WB to mark written page for later read IO. [xieyisheng1@huawei.com: fix typo in comment] Link: http://lkml.kernel.org/r/1502707447-6944-2-git-send-email-xieyisheng1@huawei.com Link: http://lkml.kernel.org/r/1498459987-24562-8-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Yisheng Xie Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit db8ffbd4e7634cc537c8d32e73e7ce0f06248645) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ie675efd6c3ec04a151443f1cd0bf798d4847710f Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 112 ++++++++++++++++++++++++++++++---- drivers/block/zram/zram_drv.h | 3 +- 2 files changed, 102 insertions(+), 13 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index df0d61dfa110..a0838669b589 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -452,9 +452,75 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) WARN_ON_ONCE(!was_set); } +void zram_page_end_io(struct bio *bio) +{ + struct page *page = bio->bi_io_vec[0].bv_page; + +
page_endio(page, bio_data_dir(bio), bio->bi_error); + bio_put(bio); +} + +static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, + u32 index, struct bio *parent, + unsigned long *pentry) +{ + struct bio *bio; + unsigned long entry; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) + return -ENOMEM; + + entry = get_entry_bdev(zram); + if (!entry) { + bio_put(bio); + return -ENOSPC; + } + + bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); + bio->bi_bdev = zram->bdev; + if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset)) { + bio_put(bio); + put_entry_bdev(zram, entry); + return -EIO; + } + + if (!parent) { + bio->bi_rw = REQ_WRITE | REQ_SYNC; + bio->bi_end_io = zram_page_end_io; + } else { + bio->bi_rw = parent->bi_rw; + bio_chain(bio, parent); + } + + submit_bio(WRITE, bio); + *pentry = entry; + + return 0; +} + +static void zram_wb_clear(struct zram *zram, u32 index) +{ + unsigned long entry; + + zram_clear_flag(zram, index, ZRAM_WB); + entry = zram_get_element(zram, index); + zram_set_element(zram, index, 0); + put_entry_bdev(zram, entry); +} + #else static bool zram_wb_enabled(struct zram *zram) { return false; } static inline void reset_bdev(struct zram *zram) {}; +static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, + u32 index, struct bio *parent, + unsigned long *pentry) + +{ + return -EIO; +} +static void zram_wb_clear(struct zram *zram, u32 index) {} #endif @@ -679,7 +745,13 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) */ static void zram_free_page(struct zram *zram, size_t index) { - unsigned long handle = zram_get_handle(zram, index); + unsigned long handle; + + if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { + zram_wb_clear(zram, index); + atomic64_dec(&zram->stats.pages_stored); + return; + } /* * No memory is allocated for same element filled pages. 
@@ -693,6 +765,7 @@ static void zram_free_page(struct zram *zram, size_t index) return; } + handle = zram_get_handle(zram, index); if (!handle) return; @@ -777,7 +850,8 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return ret; } -static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, struct bio *bio) { int ret = 0; unsigned long alloced_pages; @@ -788,6 +862,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) struct page *page = bvec->bv_page; unsigned long element = 0; enum zram_pageflags flags = 0; + bool allow_wb = true; mem = kmap_atomic(page); if (page_same_filled(mem, &element)) { @@ -812,8 +887,20 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) return ret; } - if (unlikely(comp_len > max_zpage_size)) + if (unlikely(comp_len > max_zpage_size)) { + if (zram_wb_enabled(zram) && allow_wb) { + zcomp_stream_put(zram->comp); + ret = write_to_bdev(zram, bvec, index, bio, &element); + if (!ret) { + flags = ZRAM_WB; + ret = 1; + goto out; + } + allow_wb = false; + goto compress_again; + } comp_len = PAGE_SIZE; + } /* * handle allocation has 2 paths: @@ -873,10 +960,11 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) */ zram_slot_lock(zram, index); zram_free_page(zram, index); - if (flags == ZRAM_SAME) { - zram_set_flag(zram, index, ZRAM_SAME); + + if (flags) { + zram_set_flag(zram, index, flags); zram_set_element(zram, index, element); - } else { + } else { zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); } @@ -888,7 +976,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) } static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset) + u32 index, int offset, struct bio *bio) { int ret; struct page *page = NULL; @@ -921,7 +1009,7 @@ 
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, vec.bv_offset = 0; } - ret = __zram_bvec_write(zram, &vec, index); + ret = __zram_bvec_write(zram, &vec, index, bio); out: if (is_partial_io(bvec)) __free_page(page); @@ -972,7 +1060,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, * Returns 1 if IO request was successfully submitted. */ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, int rw) + int offset, int rw, struct bio *bio) { unsigned long start_time = jiffies; int ret; @@ -986,7 +1074,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, flush_dcache_page(bvec->bv_page); } else { atomic64_inc(&zram->stats.num_writes); - ret = zram_bvec_write(zram, bvec, index, offset); + ret = zram_bvec_write(zram, bvec, index, offset, bio); } generic_end_io_acct(rw, &zram->disk->part0, start_time); @@ -1026,7 +1114,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) do { bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, unwritten); - if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, rw, bio) < 0) goto out; bv.bv_offset += bv.bv_len; @@ -1100,7 +1188,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; - ret = zram_bvec_rw(zram, &bv, index, offset, rw); + ret = zram_bvec_rw(zram, &bv, index, offset, rw, NULL); out: /* * If I/O fails, just return error(ie, non-zero) without diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 707aec0a2681..31762db861e3 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -60,9 +60,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { - /* Page consists entirely of zeros */ + /* Page consists the same element */ ZRAM_SAME = ZRAM_FLAG_SHIFT, ZRAM_ACCESS, /* page is now accessed */ 
+ ZRAM_WB, /* page is stored on backing_device */ __NR_ZRAM_PAGEFLAGS, }; From 3249ca4a577256658349ddf2585bb9e272fb5d55 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:20:07 -0700 Subject: [PATCH 1181/1212] BACKPORT: zram: read page from backing device This patch enables read IO from backing device. For the feature, it implements two IO read functions to transfer data from backing storage. One is asynchronous IO function and other is synchronous one. A reason I need synchronous IO is due to partial write which need to complete read IO before the overwriting partial data. We can make the partial IO's case asynchronous, too but at the moment, I don't feel adding more complexity to support such rare use cases so want to go with simple. [xieyisheng1@huawei.com: read_from_bdev_async(): return 1 to avoid call page_endio() in zram_rw_page()] Link: http://lkml.kernel.org/r/1502707447-6944-1-git-send-email-xieyisheng1@huawei.com Link: http://lkml.kernel.org/r/1498459987-24562-9-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Yisheng Xie Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 8e654f8fbff52ac483fb69957222853d7e2fc588) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ia82f5fc4697aacc723a336e4dad4e7bc56a1bdb9 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 123 ++++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a0838669b589..656495d61998 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -460,6 +460,95 @@ void zram_page_end_io(struct bio *bio) bio_put(bio); } +/* + * Returns 1 if the submission is successful.
+ */ +static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, + unsigned long entry, struct bio *parent) +{ + struct bio *bio; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) + return -ENOMEM; + + bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); + bio->bi_bdev = zram->bdev; + if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { + bio_put(bio); + return -EIO; + } + + if (!parent) { + bio->bi_rw = 0; + bio->bi_end_io = zram_page_end_io; + } else { + bio->bi_rw = parent->bi_rw; + bio_chain(bio, parent); + } + + submit_bio(READ, bio); + return 1; +} + +struct zram_work { + struct work_struct work; + struct zram *zram; + unsigned long entry; + struct bio *bio; +}; + +#if PAGE_SIZE != 4096 +static void zram_sync_read(struct work_struct *work) +{ + struct bio_vec bvec; + struct zram_work *zw = container_of(work, struct zram_work, work); + struct zram *zram = zw->zram; + unsigned long entry = zw->entry; + struct bio *bio = zw->bio; + + read_from_bdev_async(zram, &bvec, entry, bio); +} + +/* + * Block layer want one ->make_request_fn to be active at a time + * so if we use chained IO with parent IO in same context, + * it's a deadlock. To avoid, it, it uses worker thread context. 
+ */ +static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, + unsigned long entry, struct bio *bio) +{ + struct zram_work work; + + work.zram = zram; + work.entry = entry; + work.bio = bio; + + INIT_WORK_ONSTACK(&work.work, zram_sync_read); + queue_work(system_unbound_wq, &work.work); + flush_work(&work.work); + destroy_work_on_stack(&work.work); + + return 1; +} +#else +static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, + unsigned long entry, struct bio *bio) +{ + WARN_ON(1); + return -EIO; +} +#endif + +static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, + unsigned long entry, struct bio *parent, bool sync) +{ + if (sync) + return read_from_bdev_sync(zram, bvec, entry, parent); + else + return read_from_bdev_async(zram, bvec, entry, parent); +} + static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, u32 index, struct bio *parent, unsigned long *pentry) @@ -520,6 +609,12 @@ static int write_to_bdev(struct zram *zram, struct bio_vec *bvec, { return -EIO; } + +static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, + unsigned long entry, struct bio *parent, bool sync) +{ + return -EIO; +} static void zram_wb_clear(struct zram *zram, u32 index) {} #endif @@ -779,13 +874,31 @@ static void zram_free_page(struct zram *zram, size_t index) zram_set_obj_size(zram, index, 0); } -static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index) +static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, + struct bio *bio, bool partial_io) { int ret; unsigned long handle; unsigned int size; void *src, *dst; + if (zram_wb_enabled(zram)) { + zram_slot_lock(zram, index); + if (zram_test_flag(zram, index, ZRAM_WB)) { + struct bio_vec bvec; + + zram_slot_unlock(zram, index); + + bvec.bv_page = page; + bvec.bv_len = PAGE_SIZE; + bvec.bv_offset = 0; + return read_from_bdev(zram, &bvec, + zram_get_element(zram, index), + bio, partial_io); + } + zram_slot_unlock(zram, index); + } 
+ if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; @@ -818,7 +931,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index) } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset) + u32 index, int offset, struct bio *bio) { int ret; struct page *page; @@ -831,7 +944,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return -ENOMEM; } - ret = __zram_bvec_read(zram, page, index); + ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec)); if (unlikely(ret)) goto out; @@ -994,7 +1107,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, if (!page) return -ENOMEM; - ret = __zram_bvec_read(zram, page, index); + ret = __zram_bvec_read(zram, page, index, bio, true); if (ret) goto out; @@ -1070,7 +1183,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (rw == READ) { atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset); + ret = zram_bvec_read(zram, bvec, index, offset, bio); flush_dcache_page(bvec->bv_page); } else { atomic64_inc(&zram->stats.num_writes); From 1b89745386d9e0be76eca0182755a7026b147a7c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Sep 2017 16:20:10 -0700 Subject: [PATCH 1182/1212] UPSTREAM: zram: add config and doc file for writeback feature This patch adds document and kconfig for using of writeback feature. 
Link: http://lkml.kernel.org/r/1498459987-24562-10-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 5a47074f0279421778f97b1b1e75686696a5f42a) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I9ec2230739a6468a4481a90a9c9f966badf9ac48 Signed-off-by: Amit Pundir --- Documentation/ABI/testing/sysfs-block-zram | 8 ++++++++ Documentation/blockdev/zram.txt | 11 +++++++++++ drivers/block/zram/Kconfig | 12 ++++++++++++ 3 files changed, 31 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 451b6d882b2c..c1513c756af1 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -90,3 +90,11 @@ Description: device's debugging info useful for kernel developers. Its format is not documented intentionally and may change anytime without any notice. + +What: /sys/block/zram/backing_dev +Date: June 2017 +Contact: Minchan Kim +Description: + The backing_dev file is read-write and set up backing + device for zram to write incompressible pages. + For using, user should enable CONFIG_ZRAM_WRITEBACK. diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 4fced8a21307..257e65714c6a 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations comp_algorithm RW show and change the compression algorithm compact WO trigger memory compaction debug_stat RO this file is used for zram debugging purposes +backing_dev RW set up backend storage for zram to write out User space is advised to use the following files to read the device statistics. @@ -231,5 +232,15 @@ line of text and contains the following stats separated by whitespace: resets the disksize to zero. 
You must set the disksize again before reusing the device. +* Optional Feature + += writeback + +With incompressible pages, there is no memory saving with zram. +Instead, with CONFIG_ZRAM_WRITEBACK, zram can write incompressible page +to backing storage rather than keeping it in memory. +User should set up backing device via /sys/block/zramX/backing_dev +before disksize setting. + Nitin Gupta ngupta@vflare.org diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index b8ecba6dcd3b..7cd4a8ec3c8f 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,3 +13,15 @@ config ZRAM disks and maybe many more. See zram.txt for more information. + +config ZRAM_WRITEBACK + bool "Write back incompressible page to backing device" + depends on ZRAM + default n + help + With incompressible page, there is no memory saving to keep it + in memory. Instead, write it out to backing device. + For this feature, admin should set up backing device via + /sys/block/zramX/backing_dev. + + See zram.txt for more infomration. From a992749ff2a437a022db475afda1f167e938f66a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 3 Oct 2017 16:15:19 -0700 Subject: [PATCH 1183/1212] UPSTREAM: zram: fix null dereference of handle In testing I found handle passed to zs_map_object in __zram_bvec_read is NULL so the kernel goes oops in pin_object(). The reason is there is no routine to check the slot's freeing after getting the slot's lock. This patch fixes it.
[minchan@kernel.org: v2] Link: http://lkml.kernel.org/r/1505887347-10881-1-git-send-email-minchan@kernel.org Link: http://lkml.kernel.org/r/1505788488-26723-1-git-send-email-minchan@kernel.org Fixes: 1f7319c74275 ("zram: partial IO refactoring") Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit ae94264ed4b0cf7cd887947650db4c69acb62072) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I0ff4a8c2f1fcd0ee39511985809b58bf94b2d44c Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 36 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 656495d61998..c1f15204ebb7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -781,27 +781,6 @@ static void zram_slot_unlock(struct zram *zram, u32 index) bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); } -static bool zram_same_page_read(struct zram *zram, u32 index, - struct page *page, - unsigned int offset, unsigned int len) -{ - zram_slot_lock(zram, index); - if (unlikely(!zram_get_handle(zram, index) || - zram_test_flag(zram, index, ZRAM_SAME))) { - void *mem; - - zram_slot_unlock(zram, index); - mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, - zram_get_element(zram, index)); - kunmap_atomic(mem); - return true; - } - zram_slot_unlock(zram, index); - - return false; -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -899,11 +878,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zram_slot_unlock(zram, index); } - if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) - return 0; - zram_slot_lock(zram, index); handle = zram_get_handle(zram, index); + if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { + unsigned long value; + void *mem; + + value = 
handle ? zram_get_element(zram, index) : 0; + mem = kmap_atomic(page); + zram_fill_page(mem, PAGE_SIZE, value); + kunmap_atomic(mem); + zram_slot_unlock(zram, index); + return 0; + } + size = zram_get_obj_size(zram, index); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); From 1b5bebae86730c83a2c38df18e54b9473d318490 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:32:56 -0800 Subject: [PATCH 1184/1212] BACKPORT: zram: set BDI_CAP_STABLE_WRITES once With fast swap storage, the platform wants to use swap more aggressively and swap-in is crucial to application latency. The rw_page() based synchronous devices like zram, pmem and btt are such fast storage. When I profile swapin performance with zram lz4 decompress test, S/W overhead is more than 70%. Maybe, it would be bigger in nvdimm. This patchset reduces swap-in latency by skipping swapcache if the swap device is a synchronous device like a rw_page() based device. It enhances by 45% my swapin test (5G sequential swapin, no readahead) from 2.41sec to 1.64sec. This patch (of 4): Commit 19b7ccf8651d ("block: get rid of blk_integrity_revalidate()") fixed a weird thing (i.e., reset BDI_CAP_STABLE_WRITES flag unconditionally whenever revalidate_disk is called) so zram doesn't need to reset the flag any more when revalidating the bdev. Instead, set the flag just once when the zram device is created. It shouldn't change any behavior.
Link: http://lkml.kernel.org/r/1505886205-9671-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Ilya Dryomov Cc: Christoph Hellwig Cc: Dan Williams Cc: Ross Zwisler Cc: Jens Axboe Cc: Hugh Dickins Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e447a0151f7ce8dd884fea48279274bd64434c29) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: If41edc4871ed470f050bbf4d51a24fe5c0e18738 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c1f15204ebb7..1460bc5cc2fc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -121,14 +121,6 @@ static inline bool is_partial_io(struct bio_vec *bvec) } #endif -static void zram_revalidate_disk(struct zram *zram) -{ - revalidate_disk(zram->disk); - /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ - zram->disk->queue->backing_dev_info.capabilities |= - BDI_CAP_STABLE_WRITES; -} - /* * Check if request is within bounds and aligned on zram logical blocks. 
*/ @@ -1380,7 +1372,8 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_revalidate_disk(zram); + + revalidate_disk(zram->disk); up_write(&zram->init_lock); return len; @@ -1427,7 +1420,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - zram_revalidate_disk(zram); + revalidate_disk(zram->disk); bdput(bdev); mutex_lock(&bdev->bd_mutex); @@ -1546,6 +1539,7 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); + /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. @@ -1571,6 +1565,8 @@ static int zram_add(void) zram->disk->queue->limits.discard_zeroes_data = 0; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); + zram->disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; add_disk(zram->disk); ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, From 0e0c0d4a4c05912f0e872c5cdbd5491143b70246 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 15 Nov 2017 17:37:08 -0800 Subject: [PATCH 1185/1212] UPSTREAM: drivers/block/zram/zram_drv.c: make zram_page_end_io() static zram_page_end_io() is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'zram_page_end_io' was not declared. Should it be static? 
Link: http://lkml.kernel.org/r/20171016173336.20320-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 384bc41fc064bd8b12b7081aa3e81d26f3407045) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ie0f250e580bc1dd16e963b5dbe5bdc429fb4cd65 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1460bc5cc2fc..72f130c78bb1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -444,7 +444,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) WARN_ON_ONCE(!was_set); } -void zram_page_end_io(struct bio *bio) +static void zram_page_end_io(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; From ccca79acca3df9f45caf26ec40ae3f3765f9b23c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 28 Feb 2018 10:15:30 -0800 Subject: [PATCH 1186/1212] UPSTREAM: zram: Delete gendisk before cleaning up the request queue Remove the disk, partition and bdi sysfs attributes before cleaning up the request queue associated with the disk. 
Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Joseph Qi Reviewed-by: Ming Lei Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Signed-off-by: Jens Axboe (cherry picked from commit 392db38058eb47250a9d0cc737af37e78a7e443d) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Ifbcb6e03fee764054dc9a371c00b95547e4de745 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 72f130c78bb1..d65827eb9b27 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1628,8 +1628,8 @@ static int zram_remove(struct zram *zram) pr_info("Removed device: %s\n", zram->disk->disk_name); - blk_cleanup_queue(zram->disk->queue); del_gendisk(zram->disk); + blk_cleanup_queue(zram->disk->queue); put_disk(zram->disk); kfree(zram); return 0; From 129dca992b6fb6e0e284d7c46bcabad2840d7bb5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:39 -0700 Subject: [PATCH 1187/1212] UPSTREAM: zram: correct flag name of ZRAM_ACCESS Patch series "zram memory tracking", v5. zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. As well, it's pointless to store incompressible pages to zram so better idea is app developers manages them directly like free or mlock rather than remaining them on heap. This patch provides a debugfs /sys/kernel/debug/zram/zram0/block_state to represent each block's state so admin can investigate what memory is cold|incompressible|same page with using pagemap once the pages are swapped out. The output is as follows: 300 75.033841 .wh 301 63.806904 s.. 
302 63.806919 ..h First column is zram's block index and the 3rd one represents the symbols (s: same page w: written page to backing store h: huge page) of the block state. Second column represents the time, in seconds with microsecond resolution, at which the block was last accessed. So above example means the 300th block was accessed at 75.033841 seconds and it was huge so it was written to the backing store. This patch (of 4): ZRAM_ACCESS is used for locking a slot of zram so correct the name. It is also not a common flag to indicate status of the block so move the declare position on top of the flag. Lastly, let's move the function to the top of source code to be able to use it easily without forward declaration. Link: http://lkml.kernel.org/r/20180416090946.63057-2-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c4d6c4cc7bfd5ecc18548420b7fb9440cf8416ae) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I037a22a739fb4005918eb668d10e8be354a1524f Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 20 ++++++++++---------- drivers/block/zram/zram_drv.h | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d65827eb9b27..5439ccd0be9a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -46,6 +46,16 @@ static unsigned int num_devices = 1; static void zram_free_page(struct zram *zram, size_t index); +static void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value); +} + static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -763,16 +773,6 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -static void zram_slot_lock(struct zram
*zram, u32 index) -{ - bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 31762db861e3..a473d5c7d74f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -60,9 +60,9 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { - /* Page consists the same element */ - ZRAM_SAME = ZRAM_FLAG_SHIFT, - ZRAM_ACCESS, /* page is now accessed */ + /* zram slot is locked */ + ZRAM_LOCK = ZRAM_FLAG_SHIFT, + ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ __NR_ZRAM_PAGEFLAGS, From 5df312f29da828488f8e307774d642a4cee5bef6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:42 -0700 Subject: [PATCH 1188/1212] BACKPORT: zram: mark incompressible page as ZRAM_HUGE Mark incompressible pages so that we could investigate who is the owner of the incompressible pages once the page is swapped out via using upcoming zram memory tracker feature. With it, we could prevent such pages to be swapped out by using mlock. Otherwise we might remove them. This patch exposes new stat for huge pages via mm_stat. 
Link: http://lkml.kernel.org/r/20180416090946.63057-3-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 89e85bce4b02edb7408aebf69d5d1a6692a05f4f) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: If1b7b2d6ea6672a575ffc3d70c2c8b58ecafd0d7 Signed-off-by: Amit Pundir --- Documentation/blockdev/zram.txt | 1 + drivers/block/zram/zram_drv.c | 17 ++++++++++++++--- drivers/block/zram/zram_drv.h | 2 ++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 257e65714c6a..78db38d02bc9 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace: same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages 9) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5439ccd0be9a..8e74cafe9008 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -739,14 +739,15 @@ static ssize_t mm_stat_show(struct device *dev, max_used = atomic_long_read(&zram->stats.max_used_pages); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted); + pool_stats.pages_compacted, + (u64)atomic64_read(&zram->stats.huge_pages)); up_read(&zram->init_lock); return ret; @@ -813,6 +814,11 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long 
handle; + if (zram_test_flag(zram, index, ZRAM_HUGE)) { + zram_clear_flag(zram, index, ZRAM_HUGE); + atomic64_dec(&zram->stats.huge_pages); + } + if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { zram_wb_clear(zram, index); atomic64_dec(&zram->stats.pages_stored); @@ -981,6 +987,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } if (unlikely(comp_len > max_zpage_size)) { + comp_len = PAGE_SIZE; if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); @@ -992,7 +999,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, allow_wb = false; goto compress_again; } - comp_len = PAGE_SIZE; } /* @@ -1054,6 +1060,11 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, zram_slot_lock(zram, index); zram_free_page(zram, index); + if (comp_len == PAGE_SIZE) { + zram_set_flag(zram, index, ZRAM_HUGE); + atomic64_inc(&zram->stats.huge_pages); + } + if (flags) { zram_set_flag(zram, index, flags); zram_set_element(zram, index, element); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index a473d5c7d74f..4c4bc6042c89 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -64,6 +64,7 @@ enum zram_pageflags { ZRAM_LOCK = ZRAM_FLAG_SHIFT, ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ + ZRAM_HUGE, /* Incompressible page */ __NR_ZRAM_PAGEFLAGS, }; @@ -88,6 +89,7 @@ struct zram_stats { atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t same_pages; /* no. of same element filled pages */ + atomic64_t huge_pages; /* no. of huge pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. 
of write slow paths */ From 95986f401e7a3f98753e4eed97f4db870728831f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:45 -0700 Subject: [PATCH 1189/1212] BACKPORT: zram: record accessed second zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. Better idea is app developers free them directly rather than remaining them on heap. This patch records last access time of each block of zram so that With upcoming zram memory tracking, it could help userspace developers to reduce memory footprint. Link: http://lkml.kernel.org/r/20180416090946.63057-4-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d7eac6b6e1838ef1a1400df4ec55daa34bbc855e) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I5b217d3cd4da57e548196658e0824d65a0cad631 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 16 ++++++++++++++++ drivers/block/zram/zram_drv.h | 1 + 2 files changed, 17 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8e74cafe9008..de4fdb599a58 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -101,6 +101,16 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = sched_clock(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + static unsigned long zram_get_element(struct zram *zram, u32 index) { return zram->table[index].element; @@ -814,6 +824,8 @@ static void 
zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + zram_reset_access(zram, index); + if (zram_test_flag(zram, index, ZRAM_HUGE)) { zram_clear_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); @@ -1183,6 +1195,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, generic_end_io_acct(rw, &zram->disk->part0, start_time); + zram_slot_lock(zram, index); + zram_accessed(zram, index); + zram_slot_unlock(zram, index); + if (unlikely(ret < 0)) { if (rw == READ) atomic64_inc(&zram->stats.failed_reads); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 4c4bc6042c89..79c73f50a2a2 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,6 +78,7 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; + u64 ac_time; }; struct zram_stats { From d8ea3525da54d73b5d69fa44999b01a857209060 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:49 -0700 Subject: [PATCH 1190/1212] BACKPORT: zram: introduce zram memory tracking zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. Better idea is app developers free them directly rather than remaining them on heap. This patch tells us the last access time of each block of zram via "cat /sys/kernel/debug/zram/zram0/block_state". The output is as follows, 300 75.033841 .wh 301 63.806904 s.. 302 63.806919 ..h First column is zram's block index and the 3rd one represents the symbols (s: same page w: written page to backing store h: huge page) of the block state. Second column represents the time, in seconds with microsecond resolution, at which the block was last accessed.
So above example means the 300th block was accessed at 75.033841 seconds and it was huge so it was written to the backing store. Admin can leverage this information to catch cold|incompressible pages of process with *pagemap* once part of heaps are swapped out. I used the feature a few years ago to find memory hoggers in userspace to notify them what memory they have wasted without touch for a long time. With it, they could reduce unnecessary memory space. However, at that time, I hacked up zram for the feature but now I need the feature again so I decided it would be better to upstream rather than keeping it alone. I hope I submit the userspace tool to use the feature soon. [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: use ktime_get_boottime() instead of sched_clock()] Link: http://lkml.kernel.org/r/20180420063525.GA253739@rodete-desktop-imager.corp.google.com [akpm@linux-foundation.org: documentation tweak] [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: fix compile warning] Link: http://lkml.kernel.org/r/20180508104849.GA8209@rodete-desktop-imager.corp.google.com [rdunlap@infradead.org: fix printk formats] Link: http://lkml.kernel.org/r/3652ccb1-96ef-0b0b-05d1-f661d7733dcc@infradead.org Link: http://lkml.kernel.org/r/20180416090946.63057-5-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Randy Dunlap Reviewed-by: Sergey Senozhatsky Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c0265342bff4fcaa2cdf13f4596244c18d4a7ae5) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I932447d33d1b6af78ae6463b494006c725e5e38c Signed-off-by: Amit Pundir --- Documentation/blockdev/zram.txt | 24 ++++++ drivers/block/zram/Kconfig | 14 +++- drivers/block/zram/zram_drv.c | 140 +++++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 7 +- 4 files changed, 171 insertions(+), 14 deletions(-) diff --git a/Documentation/blockdev/zram.txt
b/Documentation/blockdev/zram.txt index 78db38d02bc9..875b2b56b87f 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -243,5 +243,29 @@ to backing storage rather than keeping it in memory. User should set up backing device via /sys/block/zramX/backing_dev before disksize setting. += memory tracking + +With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the +zram block. It could be useful to catch cold or incompressible +pages of the process with*pagemap. +If you enable the feature, you could see block state via +/sys/kernel/debug/zram/zram0/block_state". The output is as follows, + + 300 75.033841 .wh + 301 63.806904 s.. + 302 63.806919 ..h + +First column is zram's block index. +Second column is access time since the system was booted +Third column is state of the block. +(s: same page +w: written page to backing store +h: huge page) + +First line of above example says 300th block is accessed at 75.033841sec +and the block's state is huge so it is written back to the backing +storage. It's a debugging feature so anyone shouldn't rely on it to work +properly. + Nitin Gupta ngupta@vflare.org diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 7cd4a8ec3c8f..cb53957d58f9 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -12,7 +12,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See zram.txt for more information. + See Documentation/blockdev/zram.txt for more information. config ZRAM_WRITEBACK bool "Write back incompressible page to backing device" @@ -24,4 +24,14 @@ config ZRAM_WRITEBACK For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. - See zram.txt for more infomration. + See Documentation/blockdev/zram.txt for more information. 
+ +config ZRAM_MEMORY_TRACKING + bool "Track zRam block status" + depends on ZRAM && DEBUG_FS + help + With this feature, admin can track the state of allocated blocks + of zRAM. Admin could see the information via + /sys/kernel/debug/zram/zramX/block_state. + + See Documentation/blockdev/zram.txt for more information. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index de4fdb599a58..5d8abb87eed2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "zram_drv.h" @@ -61,6 +62,13 @@ static inline bool init_done(struct zram *zram) return zram->disksize; } +static inline bool zram_allocated(struct zram *zram, u32 index) +{ + + return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) || + zram->table[index].handle; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -77,7 +85,7 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram *zram, u32 index, +static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { return zram->table[index].value & BIT(flag); @@ -101,16 +109,6 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } -static void zram_accessed(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = sched_clock(); -} - -static void zram_reset_access(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = 0; -} - static unsigned long zram_get_element(struct zram *zram, u32 index) { return zram->table[index].element; @@ -630,6 +628,122 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, static void zram_wb_clear(struct zram *zram, u32 index) {} #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + +static struct dentry *zram_debugfs_root; + 
+static void zram_debugfs_create(void) +{ + zram_debugfs_root = debugfs_create_dir("zram", NULL); +} + +static void zram_debugfs_destroy(void) +{ + debugfs_remove_recursive(zram_debugfs_root); +} + +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = ktime_get_boottime(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time.tv64 = 0; +} + +static ssize_t read_block_state(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t index, written = 0; + struct zram *zram = file->private_data; + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct timespec64 ts; + gfp_t kmalloc_flags; + + kmalloc_flags = GFP_KERNEL; + if (count > PAGE_SIZE) + kmalloc_flags |= __GFP_NOWARN | __GFP_NORETRY; + + kbuf = kmalloc_node(count, kmalloc_flags, NUMA_NO_NODE); + if (!kbuf && count > PAGE_SIZE) + kbuf = vmalloc(count); + + if (!kbuf) + return -ENOMEM; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + kvfree(kbuf); + return -EINVAL; + } + + for (index = *ppos; index < nr_pages; index++) { + int copied; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + ts = ktime_to_timespec64(zram->table[index].ac_time); + copied = snprintf(kbuf + written, count, + "%12zd %12lld.%06lu %c%c%c\n", + index, (s64)ts.tv_sec, + ts.tv_nsec / NSEC_PER_USEC, + zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', + zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', + zram_test_flag(zram, index, ZRAM_HUGE) ? 
'h' : '.'); + + if (count < copied) { + zram_slot_unlock(zram, index); + break; + } + written += copied; + count -= copied; +next: + zram_slot_unlock(zram, index); + *ppos += 1; + } + + up_read(&zram->init_lock); + if (copy_to_user(buf, kbuf, written)) + written = -EFAULT; + kvfree(kbuf); + + return written; +} + +static const struct file_operations proc_zram_block_state_op = { + .open = simple_open, + .read = read_block_state, + .llseek = default_llseek, +}; + +static void zram_debugfs_register(struct zram *zram) +{ + if (!zram_debugfs_root) + return; + + zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name, + zram_debugfs_root); + debugfs_create_file("block_state", 0400, zram->debugfs_dir, + zram, &proc_zram_block_state_op); +} + +static void zram_debugfs_unregister(struct zram *zram) +{ + debugfs_remove_recursive(zram->debugfs_dir); +} +#else +static void zram_debugfs_create(void) {}; +static void zram_debugfs_destroy(void) {}; +static void zram_accessed(struct zram *zram, u32 index) {}; +static void zram_reset_access(struct zram *zram, u32 index) {}; +static void zram_debugfs_register(struct zram *zram) {}; +static void zram_debugfs_unregister(struct zram *zram) {}; +#endif /* * We switched to per-cpu streams and this attr is not needed anymore. @@ -1605,6 +1719,7 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; @@ -1638,6 +1753,7 @@ static int zram_remove(struct zram *zram) zram->claim = true; mutex_unlock(&bdev->bd_mutex); + zram_debugfs_unregister(zram); /* * Remove sysfs first, so no one will perform a disksize * store while we destroy the devices. 
This also helps during @@ -1736,6 +1852,7 @@ static void destroy_devices(void) { class_unregister(&zram_control_class); idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + zram_debugfs_destroy(); idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); } @@ -1750,6 +1867,7 @@ static int __init zram_init(void) return ret; } + zram_debugfs_create(); zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_err("Unable to get major number\n"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 79c73f50a2a2..bbda650f0dc1 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,7 +78,9 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; - u64 ac_time; +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; +#endif }; struct zram_stats { @@ -127,5 +129,8 @@ struct zram { unsigned long nr_pages; spinlock_t bitmap_lock; #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; +#endif }; #endif From 4e04ef38baf1f3090c39477f6150675a0f183949 Mon Sep 17 00:00:00 2001 From: Peter Kalauskas Date: Tue, 21 Aug 2018 21:54:02 -0700 Subject: [PATCH 1191/1212] UPSTREAM: drivers/block/zram/zram_drv.c: fix bug storing backing_dev The call to strlcpy in backing_dev_store is incorrect. It should take the size of the destination buffer instead of the size of the source buffer. Additionally, ignore the newline character (\n) when reading the new file_name buffer. This makes it possible to set the backing_dev as follows: echo /dev/sdX > /sys/block/zram0/backing_dev The reason it worked before was the fact that strlcpy() copies 'len - 1' bytes, which is strlen(buf) - 1 in our case, so it accidentally didn't copy the trailing new line symbol. Which also means that "echo -n /dev/sdX" most likely was broken. 
Signed-off-by: Peter Kalauskas Link: http://lkml.kernel.org/r/20180813061623.GC64836@rodete-desktop-imager.corp.google.com Acked-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c8bd134a4bddafe5917d163eea73873932c15e83) Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: I0a0d602b61169ae9adc8f89914ce4e30cc10e191 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5d8abb87eed2..ec034123cbec 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -339,6 +339,7 @@ static ssize_t backing_dev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { char *file_name; + size_t sz; struct file *backing_dev = NULL; struct inode *inode; struct address_space *mapping; @@ -360,7 +361,11 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - strlcpy(file_name, buf, len); + strlcpy(file_name, buf, PATH_MAX); + /* ignore trailing newline */ + sz = strlen(file_name); + if (sz > 0 && file_name[sz - 1] == '\n') + file_name[sz - 1] = 0x00; backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(backing_dev)) { From 8459a575c95c8fbf6e9bf9927db7ec39205f4d1e Mon Sep 17 00:00:00 2001 From: Peter Kalauskas Date: Fri, 24 Aug 2018 12:27:10 -0700 Subject: [PATCH 1192/1212] ANDROID: x86_64_cuttlefish_defconfig: Enable lz4 compression for zram Signed-off-by: Peter Kalauskas Bug: 112488418 Change-Id: Iab302cdf63691a3cc3124b5826206b9f6bd4adfb Signed-off-by: Peter Kalauskas Signed-off-by: Amit Pundir --- arch/x86/configs/x86_64_cuttlefish_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index e42f748f7414..38dd54633384 100644 --- 
a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -452,6 +452,7 @@ CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_LZ4=y CONFIG_CRYPTO_ZSTD=y CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y From 4c0933247e89e4319a9711122017aaae0725df68 Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Tue, 18 Apr 2017 11:44:33 -0700 Subject: [PATCH 1193/1212] ANDROID: tracing: fix race condition reading saved tgids Commit 939c7a4f04fc ("tracing: Introduce saved_cmdlines_size file") introduced ability to change saved cmdlines size. This resized saved command lines but missed resizing tgid mapping as well. Another issue is that when the resize happens, it removes saved command lines and reallocates new memory for it. This introduced a race condition when reading the global savecmd as this can be freed in the middle of accessing it causing a use after free access. Fix this by implementing locking. 
Signed-off-by: Adrian Salido Bug: 36007735 Change-Id: I334791ac35f8bcbd34362ed112aa624275a46947 (cherry picked from commit 7116d306da66de0de21e982024b4d3a3056f4461) Signed-off-by: Amit Pundir --- kernel/trace/trace.c | 101 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 81 insertions(+), 20 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3a3dd833fcf..2bee793749d8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1359,11 +1359,11 @@ void tracing_reset_all_online_cpus(void) #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT]; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; + unsigned *map_cmdline_to_tgid; unsigned cmdline_num; int cmdline_idx; char *saved_cmdlines; @@ -1397,12 +1397,23 @@ static int allocate_cmdlines_buffer(unsigned int val, return -ENOMEM; } + s->map_cmdline_to_tgid = kmalloc_array(val, + sizeof(*s->map_cmdline_to_tgid), + GFP_KERNEL); + if (!s->map_cmdline_to_tgid) { + kfree(s->map_cmdline_to_pid); + kfree(s->saved_cmdlines); + return -ENOMEM; + } + s->cmdline_idx = 0; s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); + memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_tgid)); return 0; } @@ -1568,14 +1579,17 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; + preempt_disable(); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. 
*/ - if (!arch_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) { + preempt_enable(); return 0; + } idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { @@ -1598,8 +1612,9 @@ static int trace_save_cmdline(struct task_struct *tsk) } set_cmdline(idx, tsk->comm); - saved_tgids[idx] = tsk->tgid; + savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return 1; } @@ -1641,19 +1656,29 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -int trace_find_tgid(int pid) +static int __find_tgid_locked(int pid) { unsigned map; int tgid; - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - tgid = saved_tgids[map]; + tgid = savedcmd->map_cmdline_to_tgid[map]; else tgid = -1; + return tgid; +} + +int trace_find_tgid(int pid) +{ + int tgid; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + tgid = __find_tgid_locked(pid); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -3970,10 +3995,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, { char buf[64]; int r; + unsigned int n; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + n = savedcmd->cmdline_num; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + r = scnprintf(buf, sizeof(buf), "%u\n", n); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -3982,6 +4012,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { kfree(s->saved_cmdlines); kfree(s->map_cmdline_to_pid); + kfree(s->map_cmdline_to_tgid); kfree(s); } @@ -3998,10 +4029,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + 
preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -4220,33 +4253,61 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf, char *file_buf; char *buf; int len = 0; - int pid; int i; + int *pids; + int n = 0; - file_buf = kmalloc(SAVED_CMDLINES_DEFAULT*(16+1+16), GFP_KERNEL); - if (!file_buf) + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL); + if (!pids) { + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return -ENOMEM; + } - buf = file_buf; - - for (i = 0; i < SAVED_CMDLINES_DEFAULT; i++) { - int tgid; - int r; + for (i = 0; i < savedcmd->cmdline_num; i++) { + int pid; pid = savedcmd->map_cmdline_to_pid[i]; if (pid == -1 || pid == NO_CMDLINE_MAP) continue; - tgid = trace_find_tgid(pid); - r = sprintf(buf, "%d %d\n", pid, tgid); + pids[n] = pid; + pids[n+1] = __find_tgid_locked(pid); + n += 2; + } + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + if (n == 0) { + kfree(pids); + return 0; + } + + /* enough to hold max pair of pids + space, lr and nul */ + len = n * 12; + file_buf = kmalloc(len, GFP_KERNEL); + if (!file_buf) { + kfree(pids); + return -ENOMEM; + } + + buf = file_buf; + for (i = 0; i < n && len > 0; i += 2) { + int r; + + r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]); buf += r; - len += r; + len -= r; } len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + file_buf, buf - file_buf); kfree(file_buf); + kfree(pids); return len; } From 49c3b184dc7d13bcd1e8ae430159750f3292d707 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Apr 2018 16:24:47 -0700 Subject: [PATCH 1194/1212] BACKPORT: zram: drop max_zpage_size and use zs_huge_class_size() Remove ZRAM's enforced "huge object" value and use zsmalloc huge-class watermark instead, which makes more sense. TEST - I used a 1G zram device, LZO compression back-end, original data set size was 444MB. 
Looking at the zsmalloc class stats, the test ended up being pretty fair. BASE ZRAM/ZSMALLOC ===================== zram mm_stat 498978816 191482495 199831552 0 199831552 15634 0 zsmalloc classes class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable ... 151 2448 0 0 1240 1240 744 3 0 168 2720 0 0 4200 4200 2800 2 0 190 3072 0 0 10100 10100 7575 3 0 202 3264 0 0 380 380 304 4 0 254 4096 0 0 10620 10620 10620 1 0 Total 7 46 106982 106187 48787 0 PATCHED ZRAM/ZSMALLOC ===================== zram mm_stat 498978816 182579184 194248704 0 194248704 15628 0 zsmalloc classes class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable ... 151 2448 0 0 1240 1240 744 3 0 168 2720 0 0 4200 4200 2800 2 0 190 3072 0 0 10100 10100 7575 3 0 202 3264 0 0 7180 7180 5744 4 0 254 4096 0 0 3820 3820 3820 1 0 Total 8 45 106959 106193 47424 0 As we can see, we reduced the number of objects stored in class-4096, because a huge number of objects which we previously forcibly stored in class-4096 are now stored in non-huge class-3264. This results in lower memory consumption: - zsmalloc now uses 47424 physical pages, which is less than the 48787 pages zsmalloc used before. - objects that we store in class-3264 share zspages. That's why overall the number of pages that both class-4096 and class-3264 consumed went down from 10924 to 9564. 
[sergey.senozhatsky.work@gmail.com: add pool param to zs_huge_class_size()] Link: http://lkml.kernel.org/r/20180314081833.1096-3-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20180306070639.7389-3-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 60f5921a9a4f126e081318bd6bb2bc2798b7bba8) Signed-off-by: Peter Kalauskas Bug: 113183619 Change-Id: I1d3ede25543e99a24802ad03f68995f33aaf79b5 Signed-off-by: Amit Pundir --- drivers/block/zram/zram_drv.c | 10 ++++++++-- drivers/block/zram/zram_drv.h | 16 ---------------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ec034123cbec..7ccc2e3e4ca3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,6 +44,11 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +/* + * Pages that compress to sizes equals or greater than this are stored + * uncompressed in memory. 
+ */ +static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); @@ -931,6 +936,8 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) return false; } + if (!huge_class_size) + huge_class_size = zs_huge_class_size(zram->mem_pool); return true; } @@ -1117,8 +1124,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, return ret; } - if (unlikely(comp_len > max_zpage_size)) { - comp_len = PAGE_SIZE; + if (unlikely(comp_len >= huge_class_size)) { if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index bbda650f0dc1..3a1cac486e96 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -21,22 +21,6 @@ #include "zcomp.h" -/*-- Configurable parameters */ - -/* - * Pages that compress to size greater than this are stored - * uncompressed in memory. - */ -static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; - -/* - * NOTE: max_zpage_size must be less than or equal to: - * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would - * always return failure. - */ - -/*-- End of configurable params */ - #define SECTOR_SHIFT 9 #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) From 00d3adf9f2b52139497aadcbb1a8be25acc9acb6 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 5 Apr 2018 16:24:43 -0700 Subject: [PATCH 1195/1212] BACKPORT: zsmalloc: introduce zs_huge_class_size() Patch series "zsmalloc/zram: drop zram's max_zpage_size", v3. ZRAM's max_zpage_size is a bad thing. It forces zsmalloc to store normal objects as huge ones, which results in bigger zsmalloc memory usage. Drop it and use actual zsmalloc huge-class value when decide if the object is huge or not. This patch (of 2): Not every object can be share its zspage with other objects, e.g. 
when the object is as big as a zspage or nearly as big as a zspage. For such objects zsmalloc has a so-called huge class - every object which belongs to the huge class consumes the entire zspage (which consists of a physical page). On an x86_64 box with PAGE_SHIFT 12, the first non-huge class size is 3264, so starting down from size 3264, objects can share page(-s) and thus minimize memory wastage. ZRAM, however, has its own statically defined watermark for huge objects, namely "3 * PAGE_SIZE / 4 = 3072", and forcibly stores every object larger than this watermark (3072) as a PAGE_SIZE object, in other words, to a huge class, while zsmalloc can keep some of those objects in non-huge classes. This results in increased memory consumption. zsmalloc knows better if the object is huge or not. Introduce the zs_huge_class_size() function, which tells whether a given object can be stored in one of the non-huge classes or not. This will let us drop ZRAM's huge object watermark and fully rely on zsmalloc when we decide if the object is huge. 
[sergey.senozhatsky.work@gmail.com: add pool param to zs_huge_class_size()] Link: http://lkml.kernel.org/r/20180314081833.1096-2-sergey.senozhatsky@gmail.com Link: http://lkml.kernel.org/r/20180306070639.7389-2-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 010b495e2fa32353d0ef6aa70a8169e5ef617a15) Signed-off-by: Peter Kalauskas Bug: 113183619 Change-Id: Ic35f8c1ec75f0b78bf2d83729b6aedd2999f25c8 Signed-off-by: Amit Pundir --- include/linux/zsmalloc.h | 2 ++ mm/zsmalloc.c | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 57a8e98f2708..2219cce81ca4 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -47,6 +47,8 @@ void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); +size_t zs_huge_class_size(struct zs_pool *pool); + void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm); void zs_unmap_object(struct zs_pool *pool, unsigned long handle); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8fced2101492..290e8210c13e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -202,6 +202,7 @@ static int zs_size_classes; * (see: fix_fullness_group()) */ static const int fullness_threshold_frac = 4; +static size_t huge_class_size; struct size_class { spinlock_t lock; @@ -1351,6 +1352,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); +/** + * zs_huge_class_size() - Returns the size (in bytes) of the first huge + * zsmalloc &size_class. 
+ * @pool: zsmalloc pool to use + * + * The function returns the size of the first huge class - any object of equal + * or bigger size will be stored in zspage consisting of a single physical + * page. + * + * Context: Any context. + * + * Return: the size (in bytes) of the first huge zsmalloc &size_class. + */ +size_t zs_huge_class_size(struct zs_pool *pool) +{ + return huge_class_size; +} +EXPORT_SYMBOL_GPL(zs_huge_class_size); + static unsigned long obj_malloc(struct page *first_page, struct size_class *class, unsigned long handle) { @@ -1919,12 +1939,35 @@ struct zs_pool *zs_create_pool(const char *name) for (i = zs_size_classes - 1; i >= 0; i--) { int size; int pages_per_zspage; + int objs_per_zspage; struct size_class *class; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; pages_per_zspage = get_pages_per_zspage(size); + objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; + + /* + * We iterate from biggest down to smallest classes, + * so huge_class_size holds the size of the first huge + * class. Any object bigger than or equal to that will + * endup in the huge class. + */ + if (pages_per_zspage != 1 && objs_per_zspage != 1 && + !huge_class_size) { + huge_class_size = size; + /* + * The object uses ZS_HANDLE_SIZE bytes to store the + * handle. We need to subtract it, because zs_malloc() + * unconditionally adds handle size before it performs + * size class search - so object may be smaller than + * huge class size, yet it still can end up in the huge + * class because it grows by ZS_HANDLE_SIZE extra bytes + * right before class lookup. 
+ */ + huge_class_size -= (ZS_HANDLE_SIZE - 1); + } /* * size_class is used for normal zsmalloc operation such From c5613bdc9e50268cbbe70ef5e516eadbfe95e0e6 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Wed, 5 Sep 2018 09:39:22 -0700 Subject: [PATCH 1196/1212] ANDROID: arm64: mm: fix 4.4.154 merge android-4.4 contains an out-of-tree backport of 68709f45385a ("arm64: only consider memblocks with NOMAP cleared for linear mapping"), so it should use the 4.9.y implementation of pfn_valid() that calls memblock_is_map_memory(). Change-Id: Id1d67813ee2a0a85ec69ef255daa27c4f6286800 Reported-by: Nathan Chancellor Signed-off-by: Greg Hackmann Signed-off-by: Amit Pundir --- arch/arm64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 816a69dedcbb..0dce70b86900 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -273,7 +273,7 @@ int pfn_valid(unsigned long pfn) if ((addr >> PAGE_SHIFT) != pfn) return 0; - return memblock_is_memory(addr); + return memblock_is_map_memory(addr); } EXPORT_SYMBOL(pfn_valid); #endif From 1bf5d59504cf5363c68b47599c552c7df8e92652 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 8 Jun 2017 16:44:22 -0700 Subject: [PATCH 1197/1212] BACKPORT: arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW commit dbb236c1ceb697a559e0694ac4c9e7b9131d0b16 upstream. Recently vDSO support for CLOCK_MONOTONIC_RAW was added in 49eea433b326 ("arm64: Add support for CLOCK_MONOTONIC_RAW in clock_gettime() vDSO"). Noticing that the core timekeeping code never set tkr_raw.xtime_nsec, the vDSO implementation didn't bother exposing it via the data page and instead took the unshifted tk->raw_time.tv_nsec value which was then immediately shifted left in the vDSO code. Unfortunately, by accellerating the MONOTONIC_RAW clockid, it uncovered potential 1ns time inconsistencies caused by the timekeeping core not handing sub-ns resolution. 
Now that the core code has been fixed and is actually setting tkr_raw.xtime_nsec, we need to take that into account in the vDSO by adding it to the shifted raw_time value, in order to fix the user-visible inconsistency. Rather than do that at each use (and expand the data page in the process), instead perform the shift/addition operation when populating the data page and remove the shift from the vDSO code entirely. [jstultz: minor whitespace tweak, tried to improve commit message to make it more clear this fixes a regression] Reported-by: John Stultz Signed-off-by: Will Deacon Signed-off-by: John Stultz Tested-by: Daniel Mentz Acked-by: Kevin Brodsky Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Miroslav Lichvar Link: http://lkml.kernel.org/r/1496965462-20003-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman Change-Id: I51b5ebd994635eb091ad6a084ddfa12074f27d81 Tested-by: Freddy Hsin Signed-off-by: Miles Chen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Amit Pundir --- arch/arm64/kernel/vdso.c | 1 - arch/arm64/kernel/vdso/gettimeofday.S | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 7e9dd94452bb..46fa4de29fb1 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -222,7 +222,6 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - /* tkr_raw.xtime_nsec == 0 */ vdso_data->cs_mono_mult = tk->tkr_mono.mult; vdso_data->cs_raw_mult = tk->tkr_raw.mult; /* tkr_mono.shift == tkr_raw.shift */ diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index c97ce91cf023..c39872a7b03c 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -256,7 +256,6 @@ monotonic_raw: seqcnt_check fail=monotonic_raw /* All 
computations are done with left-shifted nsecs. */ - lsl x14, x14, x12 get_nsec_per_sec res=x9 lsl x9, x9, x12 From dde03585afa4eec313d14126c3a34ba39b023acf Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Sat, 25 Aug 2018 13:50:56 -0700 Subject: [PATCH 1198/1212] FROMLIST: ANDROID: binder: Add BINDER_GET_NODE_INFO_FOR_REF ioctl. This allows the context manager to retrieve information about nodes that it holds a reference to, such as the current number of references to those nodes. Such information can for example be used to determine whether the servicemanager is the only process holding a reference to a node. This information can then be passed on to the process holding the node, which can in turn decide whether it wants to shut down to reduce resource usage. Signed-off-by: Martijn Coenen Signed-off-by: Amit Pundir --- drivers/android/binder.c | 55 +++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 10 ++++++ 2 files changed, 65 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e0c46ce312d7..11c297806d7d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4714,6 +4714,42 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) return ret; } +static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc, + struct binder_node_info_for_ref *info) +{ + struct binder_node *node; + struct binder_context *context = proc->context; + __u32 handle = info->handle; + + if (info->strong_count || info->weak_count || info->reserved1 || + info->reserved2 || info->reserved3) { + binder_user_error("%d BINDER_GET_NODE_INFO_FOR_REF: only handle may be non-zero.", + proc->pid); + return -EINVAL; + } + + /* This ioctl may only be used by the context manager */ + mutex_lock(&context->context_mgr_node_lock); + if (!context->binder_context_mgr_node || + context->binder_context_mgr_node->proc != proc) { + mutex_unlock(&context->context_mgr_node_lock); + return -EPERM; + } + 
mutex_unlock(&context->context_mgr_node_lock); + + node = binder_get_node_from_ref(proc, handle, true, NULL); + if (!node) + return -EINVAL; + + info->strong_count = node->local_strong_refs + + node->internal_strong_refs; + info->weak_count = node->local_weak_refs; + + binder_put_node(node); + + return 0; +} + static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, struct binder_node_debug_info *info) { struct rb_node *n; @@ -4807,6 +4843,25 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } + case BINDER_GET_NODE_INFO_FOR_REF: { + struct binder_node_info_for_ref info; + + if (copy_from_user(&info, ubuf, sizeof(info))) { + ret = -EFAULT; + goto err; + } + + ret = binder_ioctl_get_node_info_for_ref(proc, &info); + if (ret < 0) + goto err; + + if (copy_to_user(ubuf, &info, sizeof(info))) { + ret = -EFAULT; + goto err; + } + + break; + } case BINDER_GET_NODE_DEBUG_INFO: { struct binder_node_debug_info info; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 5539933b3491..bd0da0e992b8 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -246,6 +246,15 @@ struct binder_node_debug_info { __u32 has_weak_ref; }; +struct binder_node_info_for_ref { + __u32 handle; + __u32 strong_count; + __u32 weak_count; + __u32 reserved1; + __u32 reserved2; + __u32 reserved3; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -254,6 +263,7 @@ struct binder_node_debug_info { #define BINDER_THREAD_EXIT _IOW('b', 8, __s32) #define BINDER_VERSION _IOWR('b', 9, struct binder_version) #define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) +#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) /* * NOTE: Two special error codes you should check for when calling From 
5b182deb97f7aab0df333e102b32489083b255cf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:08 -0700 Subject: [PATCH 1199/1212] BACKPORT: list: Split list_add() debug checking into separate function (cherry-picked from d7c816733d501b59dbdc2483f2cc8e4431fd9160) Right now, __list_add() code is repeated either in list.h or in list_debug.c, but the only differences between the two versions are the debug checks. This commit therefore extracts these debug checks into a separate __list_add_valid() function and consolidates __list_add(). Additionally this new __list_add_valid() function will stop list manipulations if a corruption is detected, instead of allowing for further corruption that may lead to even worse conditions. This is slight refactoring of the same hardening done in PaX and Grsecurity. Change-Id: I9a9c9a58857cf837bec7abdb2ee4970cd1242a5e Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- include/linux/list.h | 22 ++++++++++++++------ lib/list_debug.c | 48 +++++++++++++++++++++----------------------- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 993395a2e55c..eb783a0192bd 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -28,27 +28,37 @@ static inline void INIT_LIST_HEAD(struct list_head *list) list->prev = list; } +#ifdef CONFIG_DEBUG_LIST +extern bool __list_add_valid(struct list_head *new, + struct list_head *prev, + struct list_head *next); +#else +static inline bool __list_add_valid(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + return true; +} +#endif + /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! 
*/ -#ifndef CONFIG_DEBUG_LIST static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { + if (!__list_add_valid(new, prev, next)) + return; + next->prev = new; new->next = next; new->prev = prev; prev->next = new; } -#else -extern void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next); -#endif /** * list_add - add a new entry diff --git a/lib/list_debug.c b/lib/list_debug.c index c24c2f7e296f..149dd57b583b 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -2,8 +2,7 @@ * Copyright 2006, Red Hat, Inc., Dave Jones * Released under the General Public License (GPL). * - * This file contains the linked list implementations for - * DEBUG_LIST. + * This file contains the linked list validation for DEBUG_LIST. */ #include @@ -13,33 +12,32 @@ #include /* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! + * Check that the data structures for the list manipulations are reasonably + * valid. Failures here indicate memory corruption (and possibly an exploit + * attempt). */ -void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) { - WARN(next->prev != prev, - "list_add corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - WARN(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; + if (unlikely(next->prev != prev)) { + WARN(1, "list_add corruption. next->prev should be prev (%p), but was %p. 
(next=%p).\n", + prev, next->prev, next); + return false; + } + if (unlikely(prev->next != next)) { + WARN(1, "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev); + return false; + } + if (unlikely(new == prev || new == next)) { + WARN(1, "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); + return false; + } + return true; } -EXPORT_SYMBOL(__list_add); +EXPORT_SYMBOL(__list_add_valid); void __list_del_entry(struct list_head *entry) { From e73afb4435d8c4e869a557f705498da3e08e4a83 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:09 -0700 Subject: [PATCH 1200/1212] UPSTREAM: rculist: Consolidate DEBUG_LIST for list_add_rcu() (cherry-picked from 54acd4397d7e7a725c94101180cd9f38ef701acc) This commit consolidates the debug checking for list_add_rcu() into the new single __list_add_valid() debug function. Notably, this commit fixes the sanity check that was added in commit 17a801f4bfeb ("list_debug: WARN for adding something already in the list"), which wasn't checking RCU-protected lists. Change-Id: I1f7e169d4dc45bbc9938087a171c5df747344414 Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- include/linux/rculist.h | 8 +++----- lib/list_debug.c | 19 ------------------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 5ed540986019..0c94d17a4642 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -45,19 +45,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) * This is only for internal list manipulation where we know * the prev/next entries already! 
*/ -#ifndef CONFIG_DEBUG_LIST static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { + if (!__list_add_valid(new, prev, next)) + return; + new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } -#else -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next); -#endif /** * list_add_rcu - add a new entry to rcu-protected list diff --git a/lib/list_debug.c b/lib/list_debug.c index 149dd57b583b..d0b89b9d0736 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -77,22 +77,3 @@ void list_del(struct list_head *entry) entry->prev = LIST_POISON2; } EXPORT_SYMBOL(list_del); - -/* - * RCU variants. - */ -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next) -{ - WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - new->next = next; - new->prev = prev; - rcu_assign_pointer(list_next_rcu(prev), new); - next->prev = new; -} -EXPORT_SYMBOL(__list_add_rcu); From a54e75c143b63efafb5d4095458be98e17271e48 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:10 -0700 Subject: [PATCH 1201/1212] UPSTREAM: list: Split list_del() debug checking into separate function (cherry-picked from 0cd340dcb05c4a43742fe156f36737bb2a321bfd) Similar to the list_add() debug consolidation, this commit consolidates the debug checking performed during CONFIG_DEBUG_LIST into a new __list_del_entry_valid() function, and stops list updates when corruption is found. Refactored from same hardening in PaX and Grsecurity. Change-Id: I9e3b8654ab25f3a196e3336fc4882b73010873e7 Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Acked-by: Rik van Riel Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- include/linux/list.h | 15 ++++++++----- lib/list_debug.c | 53 +++++++++++++++++++------------------------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index eb783a0192bd..d5750f2f1c36 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -32,6 +32,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) extern bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next); +extern bool __list_del_entry_valid(struct list_head *entry); #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, @@ -39,6 +40,10 @@ static inline bool __list_add_valid(struct list_head *new, { return true; } +static inline bool __list_del_entry_valid(struct list_head *entry) +{ + return true; +} #endif /* @@ -106,22 +111,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. 
*/ -#ifndef CONFIG_DEBUG_LIST static inline void __list_del_entry(struct list_head *entry) { + if (!__list_del_entry_valid(entry)) + return; + __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { - __list_del(entry->prev, entry->next); + __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } -#else -extern void __list_del_entry(struct list_head *entry); -extern void list_del(struct list_head *entry); -#endif /** * list_replace - replace old entry by new one diff --git a/lib/list_debug.c b/lib/list_debug.c index d0b89b9d0736..276565fca2a6 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -39,41 +39,34 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, } EXPORT_SYMBOL(__list_add_valid); -void __list_del_entry(struct list_head *entry) +bool __list_del_entry_valid(struct list_head *entry) { struct list_head *prev, *next; prev = entry->prev; next = entry->next; - if (WARN(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1) || - WARN(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2) || - WARN(prev->next != entry, - "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, prev->next) || - WARN(next->prev != entry, - "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, next->prev)) - return; + if (unlikely(next == LIST_POISON1)) { + WARN(1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + entry, LIST_POISON1); + return false; + } + if (unlikely(prev == LIST_POISON2)) { + WARN(1, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + entry, LIST_POISON2); + return false; + } + if (unlikely(prev->next != entry)) { + WARN(1, "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next); + return false; + } + if (unlikely(next->prev != entry)) { + WARN(1, "list_del corruption. 
next->prev should be %p, but was %p\n", + entry, next->prev); + return false; + } + return true; - __list_del(prev, next); } -EXPORT_SYMBOL(__list_del_entry); - -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is - * in an undefined state. - */ -void list_del(struct list_head *entry) -{ - __list_del_entry(entry); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -EXPORT_SYMBOL(list_del); +EXPORT_SYMBOL(__list_del_entry_valid); From 234c12e7b2baf300c0b7e40d423afd53fc01a78f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:11 -0700 Subject: [PATCH 1202/1212] UPSTREAM: bug: Provide toggle for BUG on data corruption (cherry-picked from de54ebbe26bb371a6f1fbc0593372232f04e3107) The kernel checks for cases of data structure corruption under some CONFIGs (e.g. CONFIG_DEBUG_LIST). When corruption is detected, some systems may want to BUG() immediately instead of letting the system run with known corruption. Usually these kinds of manipulation primitives can be used by security flaws to gain arbitrary memory write control. This provides a new config CONFIG_BUG_ON_DATA_CORRUPTION and a corresponding macro CHECK_DATA_CORRUPTION for handling these situations. Notably, even if not BUGing, the kernel should not continue processing the corrupted structure. This is inspired by similar hardening by Syed Rameez Mustafa in MSM kernels, and in PaX and Grsecurity, which is likely in response to earlier removal of the BUG calls in commit 924d9addb9b1 ("list debugging: use WARN() instead of BUG()"). Change-Id: I4cdfa9fbebe32a990a111d051e4ec4e421f77a09 Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Acked-by: Rik van Riel Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- include/linux/bug.h | 17 ++++++++++++++ lib/Kconfig.debug | 10 ++++++++ lib/list_debug.c | 57 +++++++++++++++++---------------------------- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/include/linux/bug.h b/include/linux/bug.h index 7f4818673c41..2bafb1d6ee89 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -109,4 +109,21 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, } #endif /* CONFIG_GENERIC_BUG */ + +/* + * Since detected data corruption should stop operation on the affected + * structures, this returns false if the corruption condition is found. + */ +#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ + do { \ + if (unlikely(condition)) { \ + if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ + pr_err(fmt, ##__VA_ARGS__); \ + BUG(); \ + } else \ + WARN(1, fmt, ##__VA_ARGS__); \ + return false; \ + } \ + } while (0) + #endif /* _LINUX_BUG_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 947ec1a19af5..d9adee033786 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1891,6 +1891,16 @@ config TEST_STATIC_KEYS If unsure, say N. +config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select CONFIG_DEBUG_LIST + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked + for validity. + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/list_debug.c b/lib/list_debug.c index 276565fca2a6..7f7bfa55eb6d 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,21 +20,16 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (unlikely(next->prev != prev)) { - WARN(1, "list_add corruption. next->prev should be prev (%p), but was %p. 
(next=%p).\n", - prev, next->prev, next); - return false; - } - if (unlikely(prev->next != next)) { - WARN(1, "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - return false; - } - if (unlikely(new == prev || new == next)) { - WARN(1, "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - return false; - } + CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next); + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev); + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); + return true; } EXPORT_SYMBOL(__list_add_valid); @@ -46,26 +41,18 @@ bool __list_del_entry_valid(struct list_head *entry) prev = entry->prev; next = entry->next; - if (unlikely(next == LIST_POISON1)) { - WARN(1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1); - return false; - } - if (unlikely(prev == LIST_POISON2)) { - WARN(1, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2); - return false; - } - if (unlikely(prev->next != entry)) { - WARN(1, "list_del corruption. prev->next should be %p, but was %p\n", - entry, prev->next); - return false; - } - if (unlikely(next->prev != entry)) { - WARN(1, "list_del corruption. next->prev should be %p, but was %p\n", - entry, next->prev); - return false; - } + CHECK_DATA_CORRUPTION(next == LIST_POISON1, + "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + entry, LIST_POISON1); + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + entry, LIST_POISON2); + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. 
prev->next should be %p, but was %p\n", + entry, prev->next); + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev); return true; } From 6fb418c93eb9596fc91b371302c0db3e62d26bf0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Aug 2016 14:42:12 -0700 Subject: [PATCH 1203/1212] BACKPORT: lkdtm: Add tests for struct list corruption (cherry-picked from 6819d101dd739dd4e8cbe60a98c9ebb224ecc992) When building under CONFIG_DEBUG_LIST, list addition and removal will be sanity-checked. This validates that the check is working as expected by setting up classic corruption attacks against list manipulations, available with the new lkdtm tests CORRUPT_LIST_ADD and CORRUPT_LIST_DEL. Change-Id: Iddf70c61b745342dd4f055dc9c1eb221ca779c2e Signed-off-by: Kees Cook Acked-by: Steven Rostedt Signed-off-by: Paul E. McKenney Acked-by: Rik van Riel Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- drivers/misc/lkdtm.c | 70 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 2a6eaf1122b4..42a0a99007be 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -47,11 +47,16 @@ #include #include #include +#include #ifdef CONFIG_IDE #include #endif +struct lkdtm_list { + struct list_head node; +}; + /* * Make sure our attempts to over run the kernel stack doesn't trigger * a compiler warning when CONFIG_FRAME_WARN is set. 
Then make sure we @@ -88,6 +93,8 @@ enum ctype { CT_EXCEPTION, CT_LOOP, CT_OVERFLOW, + CT_CORRUPT_LIST_ADD, + CT_CORRUPT_LIST_DEL, CT_CORRUPT_STACK, CT_UNALIGNED_LOAD_STORE_WRITE, CT_OVERWRITE_ALLOCATION, @@ -126,6 +133,8 @@ static char* cp_type[] = { "EXCEPTION", "LOOP", "OVERFLOW", + "CORRUPT_LIST_ADD", + "CORRUPT_LIST_DEL", "CORRUPT_STACK", "UNALIGNED_LOAD_STORE_WRITE", "OVERWRITE_ALLOCATION", @@ -548,6 +557,67 @@ static void lkdtm_do_action(enum ctype which) do_overwritten(); break; } + case CT_CORRUPT_LIST_ADD: { + /* + * Initially, an empty list via LIST_HEAD: + * test_head.next = &test_head + * test_head.prev = &test_head + */ + LIST_HEAD(test_head); + struct lkdtm_list good, bad; + void *target[2] = { }; + void *redirection = &target; + + pr_info("attempting good list addition\n"); + + /* + * Adding to the list performs these actions: + * test_head.next->prev = &good.node + * good.node.next = test_head.next + * good.node.prev = test_head + * test_head.next = good.node + */ + list_add(&good.node, &test_head); + + pr_info("attempting corrupted list addition\n"); + /* + * In simulating this "write what where" primitive, the "what" is + * the address of &bad.node, and the "where" is the address held + * by "redirection". + */ + test_head.next = redirection; + list_add(&bad.node, &test_head); + + if (target[0] == NULL && target[1] == NULL) + pr_err("Overwrite did not happen, but no BUG?!\n"); + else + pr_err("list_add() corruption not detected!\n"); + break; + } + case CT_CORRUPT_LIST_DEL: { + LIST_HEAD(test_head); + struct lkdtm_list item; + void *target[2] = { }; + void *redirection = &target; + + list_add(&item.node, &test_head); + + pr_info("attempting good list removal\n"); + list_del(&item.node); + + pr_info("attempting corrupted list removal\n"); + list_add(&item.node, &test_head); + + /* As with the list_add() test above, this corrupts "next".
*/ + item.node.next = redirection; + list_del(&item.node); + + if (target[0] == NULL && target[1] == NULL) + pr_err("Overwrite did not happen, but no BUG?!\n"); + else + pr_err("list_del() corruption not detected!\n"); + break; + } case CT_NONE: default: break; From 3891454fb8131caa4807262dc6498ade4c0ce82a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 24 Feb 2017 15:00:38 -0800 Subject: [PATCH 1204/1212] UPSTREAM: bug: switch data corruption check to __must_check (cherry-picked from 85caa95b9f19bb3a26d7e025d1134760b69e0c40) The CHECK_DATA_CORRUPTION() macro was designed to have callers do something meaningful/protective on failure. However, using "return false" in the macro too strictly limits the design patterns of callers. Instead, let callers handle the logic test directly, but make sure that the result IS checked by forcing __must_check (which appears to not be able to be used directly on macro expressions). Change-Id: I635dc2f39959104ea8b475d2d5018af3502f33ba Link: http://lkml.kernel.org/r/20170206204547.GA125312@beast Signed-off-by: Kees Cook Suggested-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- include/linux/bug.h | 12 +++++++----- lib/list_debug.c | 45 ++++++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/include/linux/bug.h b/include/linux/bug.h index 2bafb1d6ee89..833746d361cf 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -112,18 +112,20 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, /* * Since detected data corruption should stop operation on the affected - * structures, this returns false if the corruption condition is found. + * structures. Return value must be checked and sanely acted on by caller. */ +static inline __must_check bool check_data_corruption(bool v) { return v; } #define CHECK_DATA_CORRUPTION(condition, fmt, ...) 
\ - do { \ - if (unlikely(condition)) { \ + check_data_corruption(({ \ + bool corruption = unlikely(condition); \ + if (corruption) { \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ } else \ WARN(1, fmt, ##__VA_ARGS__); \ - return false; \ } \ - } while (0) + corruption; \ + })) #endif /* _LINUX_BUG_H */ diff --git a/lib/list_debug.c b/lib/list_debug.c index 7f7bfa55eb6d..a34db8d27667 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,15 +20,16 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { - CHECK_DATA_CORRUPTION(next->prev != prev, - "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - CHECK_DATA_CORRUPTION(prev->next != next, - "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - CHECK_DATA_CORRUPTION(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); + if (CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next) || + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev) || + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next)) + return false; return true; } @@ -41,18 +42,20 @@ bool __list_del_entry_valid(struct list_head *entry) prev = entry->prev; next = entry->next; - CHECK_DATA_CORRUPTION(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1); - CHECK_DATA_CORRUPTION(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2); - CHECK_DATA_CORRUPTION(prev->next != entry, - "list_del corruption. 
prev->next should be %p, but was %p\n", - entry, prev->next); - CHECK_DATA_CORRUPTION(next->prev != entry, - "list_del corruption. next->prev should be %p, but was %p\n", - entry, next->prev); + if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, + "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + entry, LIST_POISON1) || + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + entry, LIST_POISON2) || + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next) || + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev)) + return false; + return true; } From f6e2385809202c2d49e538960bebbcf22f720271 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 24 Mar 2017 10:51:25 -0700 Subject: [PATCH 1205/1212] BACKPORT: lkdtm: add bad USER_DS test (cherry-picked from e22aa9d781a27a961581c57442911309fb86a48e) This adds CORRUPT_USER_DS to check that the get_fs() test on syscall return (via __VERIFY_PRE_USERMODE_STATE) still sees USER_DS. Since trying to deal with values other than USER_DS and KERNEL_DS across all architectures in a safe way is not sensible, this sets KERNEL_DS, but since that could be extremely dangerous if the protection is not present, it also raises SIGKILL for current, so that no matter what, the process will die. A successful test will be visible with a BUG(), like all the other LKDTM tests. 
Change-Id: I1d2585de65032f0f6b9baea2a71f92bfc296c94b Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- drivers/misc/lkdtm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 42a0a99007be..8e06e1020ad9 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #ifdef CONFIG_IDE #include @@ -95,6 +97,7 @@ enum ctype { CT_OVERFLOW, CT_CORRUPT_LIST_ADD, CT_CORRUPT_LIST_DEL, + CT_CORRUPT_USER_DS, CT_CORRUPT_STACK, CT_UNALIGNED_LOAD_STORE_WRITE, CT_OVERWRITE_ALLOCATION, @@ -135,6 +138,7 @@ static char* cp_type[] = { "OVERFLOW", "CORRUPT_LIST_ADD", "CORRUPT_LIST_DEL", + "CORRUPT_USER_DS", "CORRUPT_STACK", "UNALIGNED_LOAD_STORE_WRITE", "OVERWRITE_ALLOCATION", @@ -618,6 +622,14 @@ static void lkdtm_do_action(enum ctype which) pr_err("list_del() corruption not detected!\n"); break; } + case CT_CORRUPT_USER_DS: { + pr_info("setting bad task size limit\n"); + set_fs(KERNEL_DS); + + /* Make sure we do not keep running with a KERNEL_DS! */ + force_sig(SIGKILL, current); + break; + } case CT_NONE: default: break; From 4ac94c62b18065514a02c0664da92a6832014c80 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Wed, 14 Jun 2017 18:12:01 -0700 Subject: [PATCH 1206/1212] BACKPORT: x86/syscalls: Check address limit on user-mode return (cherry-picked from 5ea0727b163cb5575e36397a12eade68a1f35f24) Ensure the address limit is a user-mode segment before returning to user-mode. Otherwise a process can corrupt kernel-mode memory and elevate privileges [1]. The set_fs function sets the TIF_SETFS flag to force a slow path on return. In the slow path, the address limit is checked to be USER_DS if needed. The addr_limit_user_check function is added as a cross-architecture function to check the address limit. 
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990 Change-Id: I604d85b262cc5b439b2665852865ca5a9ea6c5a3 Signed-off-by: Thomas Garnier Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: kernel-hardening@lists.openwall.com Cc: Catalin Marinas Cc: Will Deacon Cc: David Howells Cc: Dave Hansen Cc: Miroslav Benes Cc: Chris Metcalf Cc: Pratyush Anand Cc: Russell King Cc: Petr Mladek Cc: Rik van Riel Cc: Kees Cook Cc: Arnd Bergmann Cc: Al Viro Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: linux-arm-kernel@lists.infradead.org Cc: Will Drewry Cc: linux-api@vger.kernel.org Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/20170615011203.144108-1-thgarnie@google.com Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- arch/x86/entry/common.c | 3 +++ arch/x86/include/asm/thread_info.h | 4 +++- arch/x86/include/asm/uaccess.h | 7 ++++++- include/linux/syscalls.h | 16 ++++++++++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 071582a3b5c0..a9e501303e15 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -273,6 +274,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 128a7105cbe2..561be63b61ab 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -111,6 +111,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return 
*/ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -135,6 +136,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -145,7 +147,7 @@ struct thread_info { /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ - _TIF_NOHZ) + _TIF_NOHZ | _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 6a07c05956a6..8857f6f4daa9 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -30,7 +30,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current_thread_info()->addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c2b66a277e98..a95cb2589765 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -205,6 +205,22 @@ extern struct trace_event_functions exit_syscall_print_funcs; } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#ifdef TIF_FSCHECK +/* + * Called before coming back to user-mode. Returning to user-mode with an + * address limit different than USER_DS can allow to overwrite kernel memory. 
+ */ +static inline void addr_limit_user_check(void) +{ + + if (!test_thread_flag(TIF_FSCHECK)) + return; + + BUG_ON(!segment_eq(get_fs(), USER_DS)); + clear_thread_flag(TIF_FSCHECK); +} +#endif + asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); asmlinkage long sys_time(time_t __user *tloc); From 335f7b933d66e29a512f952880c52bd4e0f50a62 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Wed, 14 Jun 2017 18:12:03 -0700 Subject: [PATCH 1207/1212] BACKPORT: arm64/syscalls: Check address limit on user-mode return (cherry-picked from cf7de27ab35172a9240f079477cae3146a182998) Ensure the address limit is a user-mode segment before returning to user-mode. Otherwise a process can corrupt kernel-mode memory and elevate privileges [1]. The set_fs function sets the TIF_SETFS flag to force a slow path on return. In the slow path, the address limit is checked to be USER_DS if needed. [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990 Change-Id: Ic0e917286d7378cb26c15a7e553ef56fabb2f543 Signed-off-by: Thomas Garnier Reviewed-by: Catalin Marinas Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: kernel-hardening@lists.openwall.com Cc: Will Deacon Cc: David Howells Cc: Dave Hansen Cc: Miroslav Benes Cc: Chris Metcalf Cc: Pratyush Anand Cc: Russell King Cc: Petr Mladek Cc: Rik van Riel Cc: Kees Cook Cc: Arnd Bergmann Cc: Al Viro Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: linux-arm-kernel@lists.infradead.org Cc: Will Drewry Cc: linux-api@vger.kernel.org Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/20170615011203.144108-3-thgarnie@google.com Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- arch/arm64/include/asm/thread_info.h | 5 ++++- arch/arm64/include/asm/uaccess.h | 3 +++ arch/arm64/kernel/signal.c | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 
67dd228c3f17..8c22d1618260 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -120,6 +120,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ +#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -140,10 +141,12 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_FSCHECK) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index d39d8bde42d7..d0919bcb1953 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -73,6 +73,9 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); + /* * Enable/disable UAO so that copy_to_user() etc can access * kernel memory with the unprivileged instructions. 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index a8eafdbc7cb8..0bed9a899850 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -402,6 +403,9 @@ static void do_signal(struct pt_regs *regs) asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int thread_flags) { + /* Check valid user FS if needed */ + addr_limit_user_check(); + if (thread_flags & _TIF_SIGPENDING) do_signal(regs); From 255b3ec50230c189ea86d5040f2d9df2cfc0c7c6 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:44 -0700 Subject: [PATCH 1208/1212] UPSTREAM: syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check (cherry-picked from bf29ed1567b67854dc13504f685c45a2ea9b2081) Use CHECK_DATA_CORRUPTION instead of BUG_ON to provide more flexibility on address limit failures. By default, send a SIGKILL signal to kill the current process preventing exploitation of a bad address limit. Make the TIF_FSCHECK flag optional so ARM can use this function. 
Change-Id: I02b39760aaa794db77de7b0c0b1b0ec66abe1cb1 Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-2-git-send-email-keescook@chromium.org Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- include/linux/syscalls.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a95cb2589765..5d2779aa4bbe 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -205,22 +205,26 @@ extern struct trace_event_functions exit_syscall_print_funcs; } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) -#ifdef TIF_FSCHECK /* * Called before coming back to user-mode. Returning to user-mode with an * address limit different than USER_DS can allow to overwrite kernel memory. 
*/ static inline void addr_limit_user_check(void) { - +#ifdef TIF_FSCHECK if (!test_thread_flag(TIF_FSCHECK)) return; - - BUG_ON(!segment_eq(get_fs(), USER_DS)); - clear_thread_flag(TIF_FSCHECK); -} #endif + if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS), + "Invalid address limit on user-mode return")) + force_sig(SIGKILL, current); + +#ifdef TIF_FSCHECK + clear_thread_flag(TIF_FSCHECK); +#endif +} + asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); asmlinkage long sys_time(time_t __user *tloc); From f882309bbfec9d46a4fd6c61ae43368d5d1f087b Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:46 -0700 Subject: [PATCH 1209/1212] BACKPORT: arm/syscalls: Optimize address limit check (cherry-picked from e33f8d32677fa4f4f8996ef46748f86aac81ccff) Disable the generic address limit check in favor of an architecture specific optimized implementation. The generic implementation using pending work flags did not work well with ARM and alignment faults. The address limit is checked on each syscall return path to user-mode path as well as the irq user-mode return function. If the address limit was changed, a function is called to report data corruption (stopping the kernel or process based on configuration). The address limit check has to be done before any pending work because they can reset the address limit and the process is killed using a SIGKILL signal. For example the lkdtm address limit check does not work because the signal to kill the process will reset the user-mode address limit. 
Change-Id: Ic61ba05961ad1dcf10c48040427d92bd650616af Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Tested-by: Kees Cook Tested-by: Leonard Crestez Reviewed-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-4-git-send-email-keescook@chromium.org Signed-off-by: Satya Tangirala Signed-off-by: Amit Pundir --- arch/arm/kernel/entry-common.S | 10 ++++++++++ arch/arm/kernel/signal.c | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 30a7228eaceb..9440b320a8a3 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_NEED_RET_TO_USER #include @@ -35,6 +36,9 @@ ret_fast_syscall: UNWIND(.fnstart ) UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK bne fast_work_pending @@ -61,6 +65,9 @@ ret_fast_syscall: UNWIND(.cantunwind ) str r0, [sp, #S_R0 + S_OFF]! 
@ save returned r0 disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK beq no_work_pending @@ -93,6 +100,9 @@ ENTRY(ret_to_user) ret_slow_syscall: disable_irq_notrace @ disable interrupts ENTRY(ret_to_user_from_irq) + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] tst r1, #_TIF_WORK_MASK bne slow_work_pending diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 7b8f2141427b..304e68408f9c 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -631,3 +632,9 @@ struct page *get_signal_page(void) return page; } + +/* Defer to generic check */ +asmlinkage void addr_limit_check_failed(void) +{ + addr_limit_user_check(); +} From bec291181b3f70cfd55165375d464acff4ccfe71 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 26 Sep 2018 13:48:19 -0700 Subject: [PATCH 1210/1212] ANDROID: restrict store of prefer_idle as boolean It works as boolean so stores like a boolean too. 
Bug: 116734731 Test: Set stune Change-Id: I0daa3cc1723d009ed5bc2a71fa1c2e3d4ece6a7f Signed-off-by: Wei Wang Signed-off-by: Amit Pundir --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d444fc1a4d58..5e47c29b44f6 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -569,7 +569,7 @@ prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 prefer_idle) { struct schedtune *st = css_st(css); - st->prefer_idle = prefer_idle; + st->prefer_idle = !!prefer_idle; return 0; } From 18c7d13c75ce870858134ddfb8a823bd6a057489 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Jul 2018 18:08:35 -0700 Subject: [PATCH 1211/1212] ANDROID: sdcardfs: Don't use OVERRIDE_CRED macro The macro hides some control flow, making it easier to run into bugs. bug: 111642636 Change-Id: I37ec207c277d97c4e7f1e8381bc9ae743ad78435 Reported-by: Jann Horn Signed-off-by: Daniel Rosenberg Signed-off-by: Amit Pundir --- fs/sdcardfs/file.c | 24 +++-- fs/sdcardfs/inode.c | 198 +++++++++-------------------------------- fs/sdcardfs/lookup.c | 9 +- fs/sdcardfs/sdcardfs.h | 25 ------ 4 files changed, 66 insertions(+), 190 deletions(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 1461254f301d..271c4c4cb760 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -118,7 +118,11 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, goto out; /* save current_cred and override it */ - OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file))); + saved_cred = override_fsids(sbi, SDCARDFS_I(file_inode(file))->data); + if (!saved_cred) { + err = -ENOMEM; + goto out; + } if (lower_file->f_op->unlocked_ioctl) err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); @@ -127,7 +131,7 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, if (!err) sdcardfs_copy_and_fix_attrs(file_inode(file), file_inode(lower_file)); - 
REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out: return err; } @@ -149,12 +153,16 @@ static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd, goto out; /* save current_cred and override it */ - OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file))); + saved_cred = override_fsids(sbi, SDCARDFS_I(file_inode(file))->data); + if (!saved_cred) { + err = -ENOMEM; + goto out; + } if (lower_file->f_op->compat_ioctl) err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out: return err; } @@ -241,7 +249,11 @@ static int sdcardfs_open(struct inode *inode, struct file *file) } /* save current_cred and override it */ - OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(inode)); + saved_cred = override_fsids(sbi, SDCARDFS_I(inode)->data); + if (!saved_cred) { + err = -ENOMEM; + goto out_err; + } file->private_data = kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL); @@ -271,7 +283,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode)); out_revert_cred: - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_err: dput(parent); return err; diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 75a8ab2ce5a8..0b9c345cd01c 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -22,7 +22,6 @@ #include #include -/* Do not directly use this function. Use OVERRIDE_CRED() instead. */ const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_data *data) { @@ -50,7 +49,6 @@ const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, return old_cred; } -/* Do not directly use this function, use REVERT_CRED() instead. 
*/ void revert_fsids(const struct cred *old_cred) { const struct cred *cur_cred; @@ -78,7 +76,10 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); + saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb), + SDCARDFS_I(dir)->data); + if (!saved_cred) + return -ENOMEM; sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -115,53 +116,11 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_eacces: return err; } -#if 0 -static int sdcardfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) -{ - struct dentry *lower_old_dentry; - struct dentry *lower_new_dentry; - struct dentry *lower_dir_dentry; - u64 file_size_save; - int err; - struct path lower_old_path, lower_new_path; - - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb)); - - file_size_save = i_size_read(d_inode(old_dentry)); - sdcardfs_get_lower_path(old_dentry, &lower_old_path); - sdcardfs_get_lower_path(new_dentry, &lower_new_path); - lower_old_dentry = lower_old_path.dentry; - lower_new_dentry = lower_new_path.dentry; - lower_dir_dentry = lock_parent(lower_new_dentry); - - err = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), - lower_new_dentry, NULL); - if (err || !d_inode(lower_new_dentry)) - goto out; - - err = sdcardfs_interpose(new_dentry, dir->i_sb, &lower_new_path); - if (err) - goto out; - fsstack_copy_attr_times(dir, d_inode(lower_new_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_new_dentry)); - set_nlink(d_inode(old_dentry), - sdcardfs_lower_inode(d_inode(old_dentry))->i_nlink); - i_size_write(d_inode(new_dentry), file_size_save); -out: - unlock_dir(lower_dir_dentry); - sdcardfs_put_lower_path(old_dentry, &lower_old_path); - 
sdcardfs_put_lower_path(new_dentry, &lower_new_path); - REVERT_CRED(); - return err; -} -#endif - static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) { int err; @@ -178,7 +137,10 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); + saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb), + SDCARDFS_I(dir)->data); + if (!saved_cred) + return -ENOMEM; sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -209,43 +171,11 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) unlock_dir(lower_dir_dentry); dput(lower_dentry); sdcardfs_put_lower_path(dentry, &lower_path); - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_eacces: return err; } -#if 0 -static int sdcardfs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - int err; - struct dentry *lower_dentry; - struct dentry *lower_parent_dentry = NULL; - struct path lower_path; - - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb)); - - sdcardfs_get_lower_path(dentry, &lower_path); - lower_dentry = lower_path.dentry; - lower_parent_dentry = lock_parent(lower_dentry); - - err = vfs_symlink(d_inode(lower_parent_dentry), lower_dentry, symname); - if (err) - goto out; - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); - if (err) - goto out; - fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); - fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); - -out: - unlock_dir(lower_parent_dentry); - sdcardfs_put_lower_path(dentry, &lower_path); - REVERT_CRED(); - return err; -} -#endif - static int touch(char *abs_path, mode_t mode) { struct file *filp = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode); @@ -287,7 +217,10 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), 
saved_cred, SDCARDFS_I(dir)); + saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb), + SDCARDFS_I(dir)->data); + if (!saved_cred) + return -ENOMEM; /* check disk space */ parent_dentry = dget_parent(dentry); @@ -366,13 +299,21 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode if (make_nomedia_in_obb || ((pd->perm == PERM_ANDROID) && (qstr_case_eq(&dentry->d_name, &q_data)))) { - REVERT_CRED(saved_cred); - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry))); + revert_fsids(saved_cred); + saved_cred = override_fsids(sbi, + SDCARDFS_I(d_inode(dentry))->data); + if (!saved_cred) { + pr_err("sdcardfs: failed to set up .nomedia in %s: %d\n", + lower_path.dentry->d_name.name, + -ENOMEM); + goto out; + } set_fs_pwd(current->fs, &lower_path); touch_err = touch(".nomedia", 0664); if (touch_err) { pr_err("sdcardfs: failed to create .nomedia in %s: %d\n", - lower_path.dentry->d_name.name, touch_err); + lower_path.dentry->d_name.name, + touch_err); goto out; } } @@ -382,7 +323,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode out_unlock: sdcardfs_put_lower_path(dentry, &lower_path); out_revert: - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_eacces: return err; } @@ -402,7 +343,10 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); + saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb), + SDCARDFS_I(dir)->data); + if (!saved_cred) + return -ENOMEM; /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry * the dentry on the original path should be deleted. 
@@ -427,44 +371,11 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) out: unlock_dir(lower_dir_dentry); sdcardfs_put_real_lower(dentry, &lower_path); - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_eacces: return err; } -#if 0 -static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t dev) -{ - int err; - struct dentry *lower_dentry; - struct dentry *lower_parent_dentry = NULL; - struct path lower_path; - - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb)); - - sdcardfs_get_lower_path(dentry, &lower_path); - lower_dentry = lower_path.dentry; - lower_parent_dentry = lock_parent(lower_dentry); - - err = vfs_mknod(d_inode(lower_parent_dentry), lower_dentry, mode, dev); - if (err) - goto out; - - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path); - if (err) - goto out; - fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); - fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); - -out: - unlock_dir(lower_parent_dentry); - sdcardfs_put_lower_path(dentry, &lower_path); - REVERT_CRED(); - return err; -} -#endif - /* * The locking rules in sdcardfs_rename are complex. We could use a simpler * superblock-level name-space lock for renames and copy-ups. 
@@ -489,7 +400,10 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred, SDCARDFS_I(new_dir)); + saved_cred = override_fsids(SDCARDFS_SB(old_dir->i_sb), + SDCARDFS_I(new_dir)->data); + if (!saved_cred) + return -ENOMEM; sdcardfs_get_real_lower(old_dentry, &lower_old_path); sdcardfs_get_lower_path(new_dentry, &lower_new_path); @@ -536,7 +450,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, dput(lower_new_dir_dentry); sdcardfs_put_real_lower(old_dentry, &lower_old_path); sdcardfs_put_lower_path(new_dentry, &lower_new_path); - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_eacces: return err; } @@ -655,33 +569,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma if (IS_POSIXACL(inode)) pr_warn("%s: This may be undefined behavior...\n", __func__); err = generic_permission(&tmp, mask); - /* XXX - * Original sdcardfs code calls inode_permission(lower_inode,.. ) - * for checking inode permission. But doing such things here seems - * duplicated work, because the functions called after this func, - * such as vfs_create, vfs_unlink, vfs_rename, and etc, - * does exactly same thing, i.e., they calls inode_permission(). - * So we just let they do the things. - * If there are any security hole, just uncomment following if block. - */ -#if 0 - if (!err) { - /* - * Permission check on lower_inode(=EXT4). 
- * we check it with AID_MEDIA_RW permission - */ - struct inode *lower_inode; - - OVERRIDE_CRED(SDCARDFS_SB(inode->sb)); - - lower_inode = sdcardfs_lower_inode(inode); - err = inode_permission(lower_inode, mask); - - REVERT_CRED(); - } -#endif return err; - } static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia) @@ -756,7 +644,10 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct goto out_err; /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred, SDCARDFS_I(inode)); + saved_cred = override_fsids(SDCARDFS_SB(dentry->d_sb), + SDCARDFS_I(inode)->data); + if (!saved_cred) + return -ENOMEM; sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -815,7 +706,7 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct out: sdcardfs_put_lower_path(dentry, &lower_path); - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_err: return err; } @@ -898,13 +789,6 @@ const struct inode_operations sdcardfs_dir_iops = { .setattr = sdcardfs_setattr_wrn, .setattr2 = sdcardfs_setattr, .getattr = sdcardfs_getattr, - /* XXX Following operations are implemented, - * but FUSE(sdcard) or FAT does not support them - * These methods are *NOT* perfectly tested. 
- .symlink = sdcardfs_symlink, - .link = sdcardfs_link, - .mknod = sdcardfs_mknod, - */ }; const struct inode_operations sdcardfs_main_iops = { diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 206f8cbc7d7d..a671ae2338ea 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -426,7 +426,12 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, } /* save current_cred and override it */ - OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); + saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb), + SDCARDFS_I(dir)->data); + if (!saved_cred) { + ret = ERR_PTR(-ENOMEM); + goto out_err; + } sdcardfs_get_lower_path(parent, &lower_parent_path); @@ -457,7 +462,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, out: sdcardfs_put_lower_path(parent, &lower_parent_path); - REVERT_CRED(saved_cred); + revert_fsids(saved_cred); out_err: dput(parent); return ret; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 055e413509e4..99227a07a8d6 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -88,31 +88,6 @@ (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\ } while (0) -/* OVERRIDE_CRED() and REVERT_CRED() - * OVERRIDE_CRED() - * backup original task->cred - * and modifies task->cred->fsuid/fsgid to specified value. - * REVERT_CRED() - * restore original task->cred->fsuid/fsgid. - * These two macro should be used in pair, and OVERRIDE_CRED() should be - * placed at the beginning of a function, right after variable declaration. 
- */ -#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \ - do { \ - saved_cred = override_fsids(sdcardfs_sbi, info->data); \ - if (!saved_cred) \ - return -ENOMEM; \ - } while (0) - -#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \ - do { \ - saved_cred = override_fsids(sdcardfs_sbi, info->data); \ - if (!saved_cred) \ - return ERR_PTR(-ENOMEM); \ - } while (0) - -#define REVERT_CRED(saved_cred) revert_fsids(saved_cred) - /* Android 5.0 support */ /* Permission mode for a specific node. Controls how file permissions From 8fd9c723bde83907697121ca0f7beb51bbdb32da Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 20 Jul 2018 16:11:40 -0700 Subject: [PATCH 1212/1212] ANDROID: sdcardfs: Change current->fs under lock bug: 111641492 Change-Id: I79e9894f94880048edaf0f7cfa2d180f65cbcf3b Reported-by: Jann Horn Signed-off-by: Daniel Rosenberg Signed-off-by: Amit Pundir --- fs/sdcardfs/inode.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 0b9c345cd01c..6c0039284ae0 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -96,8 +96,11 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, err = -ENOMEM; goto out_unlock; } + copied_fs->umask = 0; + task_lock(current); current->fs = copied_fs; - current->fs->umask = 0; + task_unlock(current); + err = vfs_create2(lower_dentry_mnt, d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); if (err) goto out; @@ -111,7 +114,9 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, fixup_lower_ownership(dentry, dentry->d_name.name); out: + task_lock(current); current->fs = saved_fs; + task_unlock(current); free_fs_struct(copied_fs); out_unlock: unlock_dir(lower_parent_dentry); @@ -249,8 +254,11 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode unlock_dir(lower_parent_dentry); goto out_unlock; } + copied_fs->umask = 0; + task_lock(current); current->fs 
= copied_fs; - current->fs->umask = 0; + task_unlock(current); + err = vfs_mkdir2(lower_mnt, d_inode(lower_parent_dentry), lower_dentry, mode); if (err) { @@ -318,7 +326,10 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } } out: + task_lock(current); current->fs = saved_fs; + task_unlock(current); + free_fs_struct(copied_fs); out_unlock: sdcardfs_put_lower_path(dentry, &lower_path);